Setup

load sero-epi functions

source("seroepi_functions.R")

set colour palettes

# source: https://davidmathlogic.com/colorblind/#%23D81B60-%231E88E5-%23FFC107-%23004D40
region_cols <- c(`Southern Asia` ="#D81B60", `Western Africa`="#1E88E5", `Eastern Africa`="#FFC107", `Southern Africa`="#81B1A9", Global="black")
group_cols <- c(All="black", Fatal ="#D81B60", ESBL="#1E88E5", CP="#FFC107")

load data on included samples - Table S3

data_NNS_sites10 <- read_tsv("tables/TableS3_IsolatesIncluded_NEEDSACCESSIONS.tsv")
## Rows: 1930 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (29): Genome Name, Study, Region, Country, Cluster, neonatal, K_locus, K...
## dbl  (7): Site, Year, N50, contig_count, total_size, resistance_score, Morta...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

K locus analysis

read posterior estimates for adjusted counts (main) and raw counts (for comparison)

# read adjusted Bayesian estimates for K
kbayes <- parseModelledEstimates(global_post="outputs_core/K_Full_ALL_adj_28_posterior_global.csv.gz", region_post="outputs_core/K_Full_ALL_adj_28_posterior_subgroup.csv.gz")
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# read raw Bayesian estimates for K
kbayes_raw <- parseModelledEstimates(global_post="outputs_core/K_Full_ALL_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_Full_ALL_raw_28_posterior_subgroup.csv.gz")
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.

global and regional prevalence - top20 K

k_prev_global_dist <- locus_ridgesplot(posterior=kbayes$global_post,
                                   ranks=kbayes$locus_rank, 
                                   lines_every=10, 
                                   xlim=c(0,12), xbreaks=seq(0,12,1), scale=1.5,
                                   title = "a) Global estimates")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0717
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

k_prev_region_heatmap <- region_heatmap(kbayes$region_est, kbayes$global_est, 
                                   ranks=kbayes$locus_rank, median=F, title="b) Point estimates")
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).

fig 2

k_prev_global_dist + k_prev_region_heatmap
## Picking joint bandwidth of 0.0717
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggsave("figures/Fig2_K_global_regional_Kadj.pdf", width=8, height=6)
## Picking joint bandwidth of 0.0717
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).
ggsave("figures/Fig2_K_global_regional_Kadj.png", width=8, height=6)
## Picking joint bandwidth of 0.0717
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_tile()`).
## Warning: Removed 300 rows containing missing values or values outside the scale range
## (`geom_text()`).

global and regional prevalence - all K

k_prev_global_dist_all <- locus_ridgesplot(posterior=kbayes$global_post,
                                   ranks=kbayes$locus_rank,
                                   maxRank = 90, axis_font_size=8,
                                   lines_every=10, 
                                   xlim=c(0,12), xbreaks=seq(0,12,1), scale=3, title="a) Global estimates")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.04
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

k_prev_region_heatmap_all <- region_heatmap(kbayes$region_est, kbayes$global_est,
                                   ranks=kbayes$locus_rank,
                                   maxRank = 90, axis_font_size=8, median=F, title="b) Point estimates")

Figure S2 - modelled estimates for all K loci

k_prev_global_dist_all + k_prev_region_heatmap_all
## Picking joint bandwidth of 0.04
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/FigS2_K_global_regional_Kadj.pdf", width=8, height=10)
## Picking joint bandwidth of 0.04
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/FigS2_K_global_regional_Kadj.png", width=8, height=10)
## Picking joint bandwidth of 0.04
## Warning: Removed 46 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

compare global prevalence distributions using cluster-adjusted and raw counts

# plot raw vs adjusted distributions, overlaid - for appendix s2.4b
k_dist_raw_adj <- comparative_locus_ridgesplot(posterior1=kbayes$global_post,
                             posterior2=kbayes_raw$global_post,
                             ranks=kbayes$locus_rank, 
                             lines_every=10, xlim=c(0,20), xbreaks=seq(0,20,1))
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0586
## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

logistic regression of K vs year + region

result <- c()
for (k in kbayes$locus_rank$locus[1:30]) {
  d <- data_NNS_sites10 %>%
    mutate(k=if_else(K_locus==k, 1, 0)) %>%
    select(k, Year, Region)
  m <- summary(logistf(k~Year+Region, data=d))
  result <- rbind(result, cbind(coef=m$coefficients, p=m$prob, locus=k))
}
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)    lower 0.95  upper 0.95
## (Intercept)           -37.88700712 62.94467272 -161.20903016 86.48590232
## Year                    0.01803655  0.03119787   -0.04361092  0.07915654
## RegionSouthern Africa  -1.10817272  0.26921197   -1.66752642 -0.60562423
## RegionSouthern Asia    -2.84833150  0.40634815   -3.75158569 -2.13078305
## RegionWestern Africa   -1.29017996  0.49177143   -2.41049936 -0.43538436
##                            Chisq            p method
## (Intercept)            0.3591336 5.489878e-01      2
## Year                   0.3313495 5.648655e-01      2
## RegionSouthern Africa 20.8012324 5.095017e-06      2
## RegionSouthern Asia          Inf 0.000000e+00      2
## RegionWestern Africa   9.8847074 1.666580e-03      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=129.0551 on 4 df, p=0, n=1930
## Wald test = 634.4304 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.000000000 42.35322992 -370.0000000 630.0000000
## Year                   -0.064612245  0.02099325   -0.3126068   0.1829403
## RegionSouthern Africa  -0.005222793  0.15315520   -9.9309302   2.2267548
## RegionSouthern Asia    -0.136507533  0.11892282  -10.9495878   1.4778546
## RegionWestern Africa   -0.115089058  0.24308356  -26.1063908   4.8158145
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa  1.454453 2.278148e-01      2
## RegionSouthern Asia   52.980701 3.368417e-13      2
## RegionWestern Africa   4.038985 4.446057e-02      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-332.7095 on 4 df, p=1, n=1930
## Wald test = 104.6445 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95   upper 0.95
## (Intercept)           13.293360660 100.51294006 -184.4396000 214.10728127
## Year                  -0.008099124   0.04982077   -0.1076478   0.08989806
## RegionSouthern Africa  1.561731650   0.24805532    1.0763103   2.05431012
## RegionSouthern Asia   -0.757212766   0.35535994   -1.5072348  -0.09533611
## RegionWestern Africa  -0.862409383   0.83530080   -3.0509975   0.46804954
##                            Chisq          p method
## (Intercept)           0.01716419 0.89576566      2
## Year                  0.02593953 0.87204809      2
## RegionSouthern Africa        Inf 0.00000000      2
## RegionSouthern Asia   5.10052299 0.02391863      2
## RegionWestern Africa  1.38804125 0.23873616      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=66.95806 on 4 df, p=9.947598e-14, n=1930
## Wald test = 706.3714 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)    lower 0.95   upper 0.95
## (Intercept)           -5.836391670 68.60219601 -141.05894975 129.13571265
## Year                   0.001570549  0.03400289   -0.06533417   0.06858862
## RegionSouthern Africa -2.424355266  0.83115117   -4.60668849  -1.11156195
## RegionSouthern Asia    1.540401523  0.17270952    1.20440845   1.88320180
## RegionWestern Africa  -0.725214512  0.65381614   -2.31059404   0.36532572
##                              Chisq            p method
## (Intercept)            0.007184999 9.324487e-01      2
## Year                   0.002117681 9.632957e-01      2
## RegionSouthern Africa 19.178123536 1.190702e-05      2
## RegionSouthern Asia            Inf 0.000000e+00      2
## RegionWestern Africa   1.521914247 2.173300e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=147.8983 on 4 df, p=0, n=1930
## Wald test = 665.7401 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 81.81519927 -630.0000000 370.0000000
## Year                     0.06305563  0.04054606   -0.1851299   0.3105408
## RegionSouthern Africa    0.29538106  0.26948902   -2.0553070   2.6780374
## RegionSouthern Asia      0.12550575  0.22392517   -1.8835321   2.2145361
## RegionWestern Africa     0.39566655  0.41869039   -4.9012053   3.4107789
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 28.299026 1.039483e-07      2
## RegionSouthern Asia   15.063837 1.039355e-04      2
## RegionWestern Africa   4.088582 4.317385e-02      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-24.24784 on 4 df, p=1, n=1930
## Wald test = 857.3845 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionWestern Africa exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95 Chisq
## (Intercept)           -130.00000000 82.17544596 -630.0000000 370.0000000     0
## Year                     0.06316023  0.04072477   -0.1849201   0.3107086     0
## RegionSouthern Africa   -0.11865221  0.28135867   -4.6453979   2.4021033     0
## RegionSouthern Asia     -0.58058645  0.25044206   -9.5774304   1.3155810     0
## RegionWestern Africa    -0.54912634  0.55397731  -42.5317904   2.9448386     0
##                       p method
## (Intercept)           1      2
## Year                  1      2
## RegionSouthern Africa 1      2
## RegionSouthern Asia   1      2
## RegionWestern Africa  1      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-36.60871 on 4 df, p=1, n=1930
## Wald test = 851.3438 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 113.34292036 -630.0000000 370.0000000
## Year                     0.06260313   0.05616994   -0.1856303   0.3100522
## RegionSouthern Africa    0.06377154   0.43064858   -4.5265784   3.3025814
## RegionSouthern Asia      0.74218717   0.29006127   -0.6282241   3.8778357
## RegionWestern Africa     0.33087275   0.64094786   -8.6563226   4.2429066
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa  2.348115 1.254346e-01      2
## RegionSouthern Asia   68.763252 1.110223e-16      2
## RegionWestern Africa   1.494815 2.214708e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=0.0388931 on 4 df, p=0.9998133, n=1930
## Wald test = 708.9709 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -9.287309e+01 150.19917308 -394.0382572 210.9756308
## Year                   4.401260e-02   0.07443808   -0.1066044   0.1932379
## RegionSouthern Africa -1.668985e-05   0.55212092   -1.2093728   1.0171781
## RegionSouthern Asia   -3.992089e-02   0.43784822   -0.9523869   0.8016919
## RegionWestern Africa   1.494779e+00   0.49911715    0.4141775   2.4189995
##                              Chisq           p method
## (Intercept)           3.633107e-01 0.546673301      2
## Year                  3.322166e-01 0.564356778      2
## RegionSouthern Africa 4.179225e-08 0.999836887      2
## RegionSouthern Asia   8.177574e-03 0.927945565      2
## RegionWestern Africa  6.802541e+00 0.009102825      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=7.490749 on 4 df, p=0.1121179, n=1930
## Wald test = 546.3554 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.00000000 95.90941917 -370.0000000 630.0000000
## Year                   -0.06581753  0.04754515   -0.3138735   0.1818001
## RegionSouthern Africa  -0.72863282  0.43193324  -40.1044448   1.5512142
## RegionSouthern Asia    -0.58153184  0.30071691   -7.7393960   1.1304430
## RegionWestern Africa   -0.33267553  0.57723135  -13.7947501   2.5832885
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 49.074572 2.464140e-12      2
## RegionSouthern Asia   56.343948 6.084022e-14      2
## RegionWestern Africa   2.798747 9.433798e-02      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-4.102337 on 4 df, p=1, n=1930
## Wald test = 784.6969 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)  lower 0.95  upper 0.95
## (Intercept)           -86.56364422 184.50293426 -481.823818 277.1169491
## Year                    0.03963728   0.09143892   -0.140649   0.2354744
## RegionSouthern Africa   0.31768391   1.63625155   -4.677210   3.2789784
## RegionSouthern Asia     3.49603759   0.85205684    2.086662   5.7147520
## RegionWestern Africa    1.55807953   1.62129221   -3.428864   4.4986615
##                            Chisq         p method
## (Intercept)           0.20476609 0.6509010      2
## Year                  0.17451300 0.6761317      2
## RegionSouthern Africa 0.03524504 0.8510831      2
## RegionSouthern Asia          Inf 0.0000000      2
## RegionWestern Africa  0.67652911 0.4107845      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=48.42003 on 4 df, p=7.714304e-10, n=1930
## Wald test = 297.622 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95 Chisq p
## (Intercept)           130.00000000 78.04801956 -370.0000000 630.0000000     0 1
## Year                   -0.06577325  0.03869063   -0.3139322   0.1816988     0 1
## RegionSouthern Africa   0.18520086  0.29709283  -57.3074714   3.3731811     0 1
## RegionSouthern Asia     0.63212472  0.20477903   -3.0375828   6.2111558     0 1
## RegionWestern Africa    0.07147776  0.46816261  -46.7892963  19.9593452     0 1
##                       method
## (Intercept)                2
## Year                       2
## RegionSouthern Africa      2
## RegionSouthern Asia        2
## RegionWestern Africa       2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-116.5062 on 4 df, p=1, n=1930
## Wald test = 859.8824 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -2.5289717314 163.3541682 -327.3014277 335.1864314
## Year                  -0.0006874461   0.0809675   -0.1681138   0.1602526
## RegionSouthern Africa -0.0547657083   0.5534894   -1.2668681   0.9693052
## RegionSouthern Asia   -1.3429350411   0.6823067   -2.9727427  -0.1560814
## RegionWestern Africa   0.5352844642   0.6755352   -1.0819827   1.6977178
##                              Chisq          p method
## (Intercept)           2.262995e-04 0.98799767      2
## Year                  6.810321e-05 0.99341556      2
## RegionSouthern Africa 9.740342e-03 0.92138189      2
## RegionSouthern Asia   5.070673e+00 0.02433393      2
## RegionWestern Africa  5.433242e-01 0.46105832      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=6.847377 on 4 df, p=0.1441769, n=1930
## Wald test = 497.3219 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionWestern Africa exceeded. Try to increase the
## number of iterations by passing 'logistpl.control(maxit=...)' to parameter
## plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)  lower 0.95 upper 0.95 Chisq p
## (Intercept)           130.00000000 134.22619840 -370.000000 630.000000     0 1
## Year                   -0.06653586   0.06654095   -0.314773   0.180839     0 1
## RegionSouthern Africa   2.95363495   0.33867139    2.177617  17.499040   Inf 0
## RegionSouthern Asia     0.45165127   0.41957356   -4.459098  14.069911     0 1
## RegionWestern Africa    0.32950543   0.85060573  -32.033487  14.407974     0 1
##                       method
## (Intercept)                2
## Year                       2
## RegionSouthern Africa      2
## RegionSouthern Asia        2
## RegionWestern Africa       2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=138.6139 on 4 df, p=0, n=1930
## Wald test = 551.9099 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 150.18908447 -630.0000000 278.9166630
## Year                     0.06231615   0.07442974   -0.1405737   0.3098365
## RegionSouthern Africa   -0.41165962   0.67317470   -3.8333338   1.2735644
## RegionSouthern Asia      0.78842620   0.37774531   -0.1097688   2.1725314
## RegionWestern Africa    -0.26447518   1.08897622   -7.9268974   2.1026305
##                          Chisq            p method
## (Intercept)            0.00000 1.000000e+00      2
## Year                   0.00000 1.000000e+00      2
## RegionSouthern Africa  0.00000 1.000000e+00      2
## RegionSouthern Asia   33.14629 8.547941e-09      2
## RegionWestern Africa   0.00000 1.000000e+00      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=8.514077 on 4 df, p=0.07446169, n=1930
## Wald test = 547.9156 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 100.55350447 -630.0000000 370.0000000
## Year                     0.06288445   0.04983212   -0.1852318   0.3104054
## RegionSouthern Africa   -0.29650010   0.37953944  -11.5947192   2.8680071
## RegionSouthern Asia     -0.19711050   0.28599828   -5.4876196   2.5693238
## RegionWestern Africa     0.28176425   0.50752231  -10.1203269   4.2209916
##                          Chisq         p method
## (Intercept)           0.000000 1.0000000      2
## Year                  0.000000 1.0000000      2
## RegionSouthern Africa 0.000000 1.0000000      2
## RegionSouthern Asia   0.000000 1.0000000      2
## RegionWestern Africa  2.333477 0.1266188      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-37.75674 on 4 df, p=1, n=1930
## Wald test = 780.8683 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)   lower 0.95  upper 0.95 Chisq
## (Intercept)           130.00000000 131.07456157 -370.0000000 630.0000000     0
## Year                   -0.06629896   0.06497838   -0.3144327   0.1812064     0
## RegionSouthern Africa   0.64407943   0.42148886   -2.4845919   3.2026937     0
## RegionSouthern Asia     0.18928824   0.36855386   -3.9269408   3.5431429     0
## RegionWestern Africa    0.76517970   0.57302894   -5.2859816   5.1747592     0
##                       p method
## (Intercept)           1      2
## Year                  1      2
## RegionSouthern Africa 1      2
## RegionSouthern Asia   1      2
## RegionWestern Africa  1      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-15.84383 on 4 df, p=1, n=1930
## Wald test = 655.9728 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionWestern Africa exceeded. Try to increase the
## number of iterations by passing 'logistpl.control(maxit=...)' to parameter
## plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.000000000 108.70208142 -370.0000000 630.0000000
## Year                   -0.066153380   0.05388737   -0.3143147   0.1813291
## RegionSouthern Africa  -0.033681532   0.45180851  -40.8127398   3.4352329
## RegionSouthern Asia     0.695420328   0.28164204   -1.4916843   6.0597885
## RegionWestern Africa    0.001321003   0.68123142  -39.3723701   5.9850061
##                          Chisq         p method
## (Intercept)           0.000000 1.0000000      2
## Year                  0.000000 1.0000000      2
## RegionSouthern Africa 1.743079 0.1867492      2
## RegionSouthern Asia   0.000000 1.0000000      2
## RegionWestern Africa  0.000000 1.0000000      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-24.96682 on 4 df, p=1, n=1930
## Wald test = 732.9121 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -1.300000e+02 113.03460149 -630.0000000 370.0000000
## Year                   6.273860e-02   0.05601728   -0.1853879   0.3102525
## RegionSouthern Africa -3.728792e-01   0.44620658  -13.2287359   2.5699286
## RegionSouthern Asia    4.646229e-03   0.30913211   -3.2410869   2.6855528
## RegionWestern Africa   1.485499e-01   0.61133533  -11.7871856   3.8904989
##                           Chisq         p method
## (Intercept)           0.0000000 1.0000000      2
## Year                  0.0000000 1.0000000      2
## RegionSouthern Africa 0.0000000 1.0000000      2
## RegionSouthern Asia   0.3399628 0.5598507      2
## RegionWestern Africa  0.7492826 0.3867035      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-24.08208 on 4 df, p=1, n=1930
## Wald test = 720.5392 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           67.42616492 191.61169358 -318.6943552 470.4228275
## Year                  -0.03552461   0.09498226   -0.2353401   0.1558267
## RegionSouthern Africa -0.23958403   0.71929929   -1.9145684   1.0540654
## RegionSouthern Asia   -0.60408646   0.61429099   -1.9958828   0.5249492
## RegionWestern Africa  -0.74958430   1.42915330   -5.6030294   1.2752906
##                           Chisq         p method
## (Intercept)           0.1142159 0.7353945      2
## Year                  0.1290975 0.7193696      2
## RegionSouthern Africa 0.1134622 0.7362364      2
## RegionSouthern Asia   1.0633022 0.3024632      2
## RegionWestern Africa  0.3430248 0.5580884      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=1.85572 on 4 df, p=0.7622738, n=1930
## Wald test = 432.0288 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Asia exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef   se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 97.1257619 -630.0000000 370.0000000
## Year                     0.06287866  0.0481334   -0.1852788   0.3103755
## RegionSouthern Africa    0.02759329  0.3412474   -8.4368436   4.9140508
## RegionSouthern Asia      0.10724356  0.2636664   -3.6992784   4.7415529
## RegionWestern Africa     0.15826796  0.5349839  -22.5771802   5.9224465
##                           Chisq            p method
## (Intercept)            0.000000 1.0000000000      2
## Year                   0.000000 1.0000000000      2
## RegionSouthern Africa  2.205763 0.1374958471      2
## RegionSouthern Asia   13.072036 0.0002997371      2
## RegionWestern Africa   1.243564 0.2647852482      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-52.44613 on 4 df, p=1, n=1930
## Wald test = 801.223 on 4 df, p = 0logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           40.16497678 187.65434397 -336.3843993 434.6257289
## Year                  -0.02198032   0.09301729   -0.2175561   0.1646218
## RegionSouthern Africa  0.26081306   0.57750519   -0.9884602   1.3551362
## RegionSouthern Asia   -1.03248446   0.69590913   -2.6843960   0.2005478
## RegionWestern Africa  -0.82037254   1.42802169   -5.6727860   1.1984511
##                            Chisq         p method
## (Intercept)           0.04241190 0.8368366      2
## Year                  0.05172378 0.8200905      2
## RegionSouthern Africa 0.19340481 0.6600978      2
## RegionSouthern Asia   2.73361260 0.0982568      2
## RegionWestern Africa  0.42208547 0.5158983      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=4.147128 on 4 df, p=0.3864603, n=1930
## Wald test = 443.674 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95 upper 0.95
## (Intercept)           -130.00000000 97.33432906 -630.0000000 370.000000
## Year                     0.06291597  0.04823681   -0.1851852   0.310441
## RegionSouthern Africa   -0.34211618  0.37426273  -27.7561357   3.881645
## RegionSouthern Asia     -0.45603242  0.29941085  -43.7377732   2.975136
## RegionWestern Africa     1.25529523  0.35249581   -3.1626974   6.155618
##                          Chisq           p method
## (Intercept)            0.00000 1.00000e+00      2
## Year                   0.00000 1.00000e+00      2
## RegionSouthern Africa  0.00000 1.00000e+00      2
## RegionSouthern Asia    0.00000 1.00000e+00      2
## RegionWestern Africa  30.33557 3.63401e-08      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-29.15767 on 4 df, p=1, n=1930
## Wald test = 776.6436 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 79.26079190 -630.0000000 370.0000000
## Year                     0.06308596  0.03928019   -0.1850915   0.3105714
## RegionSouthern Africa    0.82281458  0.23134895   -1.6159109  81.4641308
## RegionSouthern Asia      0.07893681  0.22038916   -6.9120299 157.9068728
## RegionWestern Africa     0.05212533  0.46369973  -47.6618229  94.4174770
##                            Chisq            p method
## (Intercept)            0.0000000 1.0000000000      2
## Year                   0.0000000 1.0000000000      2
## RegionSouthern Africa        Inf 0.0000000000      2
## RegionSouthern Asia   17.1494083 0.0000345517      2
## RegionWestern Africa   0.6679147 0.4137795700      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-97.70746 on 4 df, p=1, n=1930
## Wald test = 848.6715 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           37.75387494 227.4609149 -430.9132528 525.4459673
## Year                  -0.02111177   0.1127486   -0.2629266   0.2111244
## RegionSouthern Africa -1.29173684   1.4713700   -6.1747336   0.9015368
## RegionSouthern Asia   -0.02587252   0.6623794   -1.4935255   1.2481205
## RegionWestern Africa   1.48137361   0.7221376   -0.1998597   2.7963714
##                             Chisq         p method
## (Intercept)           0.024391227 0.8758936      2
## Year                  0.031056464 0.8601145      2
## RegionSouthern Africa 1.074239425 0.2999898      2
## RegionSouthern Asia   0.001448938 0.9696359      2
## RegionWestern Africa  3.101150142 0.0782370      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=4.971146 on 4 df, p=0.2902709, n=1930
## Wald test = 338.3153 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           127.28937147 215.2263317 -372.7106285 627.2893715
## Year                   -0.06556842   0.1066958   -0.3136308   0.1820858
## RegionSouthern Africa  -0.40981472   1.0897222   -6.4813708   1.8397821
## RegionSouthern Asia     0.93212232   0.5504222   -0.2233164   2.6005360
## RegionWestern Africa    0.24025986   1.2879280   -5.4493903   2.7180379
##                          Chisq         p method
## (Intercept)           0.000000 1.0000000      2
## Year                  0.000000 1.0000000      2
## RegionSouthern Africa 1.611557 0.2042731      2
## RegionSouthern Asia   2.479686 0.1153253      2
## RegionWestern Africa  0.000000 1.0000000      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=3.7693 on 4 df, p=0.438127, n=1930
## Wald test = 370.1267 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95 upper 0.95
## (Intercept)           -130.00000000 65.34167150 -630.0000000 370.000000
## Year                     0.06327475  0.03238243   -0.1849337   0.310748
## RegionSouthern Africa   -0.12595693  0.25215434  -44.8215034 185.227699
## RegionSouthern Asia      0.61534944  0.16881614   -1.2996132 186.488972
## RegionWestern Africa     0.02607048  0.39894575  -45.7886646  92.480010
##                           Chisq         p method
## (Intercept)           0.0000000 1.0000000      2
## Year                  0.0000000 1.0000000      2
## RegionSouthern Africa 0.0000000 1.0000000      2
## RegionSouthern Asia         Inf 0.0000000      2
## RegionWestern Africa  0.4584689 0.4983404      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-138.2533 on 4 df, p=1, n=1930
## Wald test = 832.8149 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionWestern Africa exceeded. Try to increase the
## number of iterations by passing 'logistpl.control(maxit=...)' to parameter
## plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95 Chisq p
## (Intercept)           130.00000000 119.2853913 -370.0000000 630.0000000     0 1
## Year                   -0.06625107   0.0591340   -0.3144155   0.1812235     0 1
## RegionSouthern Africa   0.30789860   0.4426178   -8.2555518   3.6189367     0 1
## RegionSouthern Asia     0.64359073   0.3119323   -1.6509401   6.2207866     0 1
## RegionWestern Africa    0.06128208   0.7290114  -38.6917511   6.2159535     0 1
##                       method
## (Intercept)                2
## Year                       2
## RegionSouthern Africa      2
## RegionSouthern Asia        2
## RegionWestern Africa       2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-23.67887 on 4 df, p=1, n=1930
## Wald test = 693.3726 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95 upper 0.95
## (Intercept)           126.34968980 227.1797625 -373.6503102 626.349690
## Year                   -0.06582648   0.1126214   -0.3142313   0.181512
## RegionSouthern Africa   1.66136824   1.0665348   -0.6940896   9.080411
## RegionSouthern Asia     2.80818174   0.7935580    1.6670722  10.185614
## RegionWestern Africa    1.60510674   1.4781118   -3.7621368   9.207465
##                            Chisq            p method
## (Intercept)            0.0000000 1.000000e+00      2
## Year                   0.0000000 1.000000e+00      2
## RegionSouthern Africa  2.1209410 1.452972e-01      2
## RegionSouthern Asia   27.8831621 1.288663e-07      2
## RegionWestern Africa   0.7488254 3.868484e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=20.48004 on 4 df, p=0.0004014047, n=1930
## Wald test = 296.3746 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef     se(coef)  lower 0.95  upper 0.95
## (Intercept)           130.00000000 115.59195956 -355.235652 630.0000000
## Year                   -0.06596543   0.05730267   -0.313986   0.1743997
## RegionSouthern Africa  -0.89015265   0.53288501   -7.872837   0.9249096
## RegionSouthern Asia    -1.27786915   0.44476257  -20.237517  -0.2057696
## RegionWestern Africa   -0.40619838   0.68538874   -6.861860   1.7952887
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 36.971556 1.198651e-09      2
## RegionSouthern Asia   67.396484 2.220446e-16      2
## RegionWestern Africa   2.122844 1.451168e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=19.57295 on 4 df, p=0.000606272, n=1930
## Wald test = 673.5065 on 4 df, p = 0
## Warning in logistf(k ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(k ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = k ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95 Chisq
## (Intercept)           -130.00000000 49.04076208 -630.0000000 370.0000000     0
## Year                     0.06383803  0.02430516   -0.1842753   0.3114068     0
## RegionSouthern Africa   -0.32326558  0.18069720  -53.5228447   5.5652948     0
## RegionSouthern Asia     -0.26716240  0.13874266  -31.7520349   5.4974921     0
## RegionWestern Africa    -0.19657118  0.28903769  -45.2972609   5.3490828     0
##                       p method
## (Intercept)           1      2
## Year                  1      2
## RegionSouthern Africa 1      2
## RegionSouthern Asia   1      2
## RegionWestern Africa  1      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-526.3923 on 4 df, p=1, n=1930
## Wald test = 541.8715 on 4 df, p = 0
logistic_sig <- as_tibble(result, rownames="variable") %>% filter(p<0.05) %>% left_join(kbayes$locus_rank)
## Joining with `by = join_by(locus)`
logistic_sig %>% filter(p<0.05)
## # A tibble: 15 × 5
##    variable              coef               p                    locus  rank
##    <chr>                 <chr>              <chr>                <chr> <int>
##  1 RegionSouthern Asia   -2.84833149909426  0                    KL2       1
##  2 RegionWestern Africa  -1.29017996241314  0.00166658030282141  KL2       1
##  3 RegionWestern Africa  -0.115089058476192 0.0444605650057359   KL102     2
##  4 RegionSouthern Africa 1.56173164966805   0                    KL25      3
##  5 RegionSouthern Asia   -0.757212766403901 0.0239186306965349   KL25      3
##  6 RegionSouthern Asia   1.54040152282165   0                    KL15      4
##  7 RegionSouthern Asia   0.125505745865047  0.000103935505882391 KL62      5
##  8 RegionWestern Africa  0.395666546573237  0.0431738528553842   KL62      5
##  9 RegionWestern Africa  1.49477887691326   0.00910282464090573  KL17      8
## 10 RegionSouthern Asia   3.49603759035185   0                    KL51     10
## 11 RegionSouthern Asia   -1.34293504105872  0.0243339302006811   KL39     12
## 12 RegionSouthern Africa 2.95363495302061   0                    KL149    13
## 13 RegionSouthern Asia   0.107243564111339  0.000299737092402941 KL48     20
## 14 RegionSouthern Africa 0.822814575488765  0                    KL8      23
## 15 RegionSouthern Asia   0.615349436699138  0                    KL81     26
p<-0.02

region_2pc <- kbayes$region_est %>% select(locus, subgroup, mean) %>% 
  pivot_wider(id_cols=locus, names_from=subgroup, values_from = mean) %>% 
  mutate(d1=abs(`Eastern Africa`-`Southern Africa`),
        d2=abs(`Eastern Africa`-`Western Africa`),
        d3=abs(`Eastern Africa`-`Southern Asia`),
        d4=abs(`Southern Africa`-`Southern Asia`),
        d5=abs(`Southern Africa`-`Western Africa`),
        d6=abs(`Western Africa`-`Southern Asia`)) %>% 
  filter(d1>p | d2>p | d3>p | d4>p | d5>p | d6>p)

Fig S3 - overlapping regional densities - log2 scale

kbayes$region_post %>% 
    left_join(kbayes$locus_rank) %>%
    filter(rank<=30) %>%
    mutate(Region=fct_relevel(subgroup,c("Western Africa", "Southern Africa", "Eastern Africa", "Southern Asia"))) %>%
    ggplot() +
    geom_hline(yintercept=11) +
    geom_hline(yintercept=21) +
    geom_density_ridges(aes(x=log2(prob), y=locus, fill=Region), 
                        scale=1, rel_min_height = 0.01, alpha=0.5, col=NA) + 
    scale_y_discrete(limits=rev(kbayes$locus_rank$locus[1:30])) + 
  scale_x_continuous(limits=c(-13,0)) +
  labs(x="log2(prevalence) per region", y="", fill="Region") + 
  annotate(y=region_2pc$locus, label="*", x=-1, geom="text") + 
  annotate(y=logistic_sig$locus, label="^", x=-0.5, geom="text") +
  scale_fill_manual(values=region_cols) +
  theme_bw()
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.123
## Warning: Removed 2636 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).

ggsave("figures/FigS3_RegionalKBayesDist.pdf", width=6, height=9)
## Picking joint bandwidth of 0.123
## Warning: Removed 2636 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).
ggsave("figures/FigS3_RegionalKBayesDist.png", width=6, height=9)
## Picking joint bandwidth of 0.123
## Warning: Removed 2636 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_text()`).

O locus analysis

read posterior estimates for adjusted counts (main) and raw counts (for comparison)

# read adjusted Bayesian estimates for O
obayes <- parseModelledEstimates(global_post ="outputs_core/O_Full_ALL_adj_28_posterior_global.csv.gz", region_post = "outputs_core/O_Full_ALL_adj_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# read raw Bayesian estimates for O
obayes_raw <- parseModelledEstimates(global_post ="outputs_core/O_Full_ALL_raw_28_posterior_global.csv.gz", region_post = "outputs_core/O_Full_ALL_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.

global and regional prevalence

o_prev_global_dist <- locus_ridgesplot(posterior=obayes$global_post, 
                                   ranks=obayes$locus_rank, 
                                   lines_every=5, maxRank=15, 
                                   xlim=c(0,30), xbreaks=seq(0,30,5), title="a) Global estimates")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0901
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

o_prev_region_heatmap <- region_heatmap(estimates=obayes$region_est, global=obayes$global_est,
                                   ranks=obayes$locus_rank, maxRank=15, median=F, title="b) Point estimates")

# fig 4a
o_prev_global_dist + o_prev_region_heatmap
## Picking joint bandwidth of 0.0901
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

compare global prevalence distributions using cluster-adjusted and raw counts

# plot raw vs adjusted distributions, overlaid - for appendix s2.4b
o_dist_raw_adj <- comparative_locus_ridgesplot(posterior1=obayes$global_post,
                             posterior2=obayes_raw$global_post,
                             ranks=obayes$locus_rank, maxRank=10,
                             lines_every=5, xlim=c(0,30), xbreaks=seq(0,30,5))
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.103
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

logistic regression of O vs year + region

oresult <- c()
for (o in obayes$locus_rank$locus[1:10]) {
  d <- data_NNS_sites10 %>%
    mutate(o=if_else(O_genotype==o, 1, 0)) %>%
    select(o, Year, Region)
  m <- summary(logistf(o~Year+Region, data=d))
  oresult <- rbind(oresult, cbind(coef=m$coefficients, p=m$prob, locus=o))
}
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef   se(coef)    lower 0.95  upper 0.95
## (Intercept)           -7.915304840 47.1640372 -100.40042475 84.68212427
## Year                   0.003453048  0.0233770   -0.04244445  0.04929231
## RegionSouthern Africa -0.005521896  0.1660590   -0.33526304  0.31653647
## RegionSouthern Asia   -0.493627573  0.1396298   -0.77089234 -0.22290635
## RegionWestern Africa   0.302361411  0.2491794   -0.19872272  0.78219635
##                             Chisq            p method
## (Intercept)            0.02812636 0.8668120895      2
## Year                   0.02178871 0.8826505754      2
## RegionSouthern Africa  0.00110587 0.9734715390      2
## RegionSouthern Asia   12.98331748 0.0003142786      2
## RegionWestern Africa   1.42967689 0.2318171782      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=17.81873 on 4 df, p=0.001338928, n=1930
## Wald test = 411.9142 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year exceeded.
## Try to increase the number of iterations by passing
## 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 43.66106855 -630.0000000 370.0000000
## Year                     0.06410768  0.02163974   -0.1840446   0.3117062
## RegionSouthern Africa   -0.09593226  0.15526698   -4.1624178   2.5154891
## RegionSouthern Asia     -0.29052407  0.12334809   -4.4830882   1.5833615
## RegionWestern Africa     0.04536010  0.24576571   -8.8036957   4.4799611
##                          Chisq         p method
## (Intercept)           0.000000 1.0000000      2
## Year                  0.000000 1.0000000      2
## RegionSouthern Africa 0.000000 1.0000000      2
## RegionSouthern Asia   0.000000 1.0000000      2
## RegionWestern Africa  1.165171 0.2803956      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-198.3195 on 4 df, p=1, n=1930
## Wald test = 214.3342 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia, RegionWestern Africa exceeded. Try
## to increase the number of iterations by passing 'logistpl.control(maxit=...)'
## to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                               coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.00000000 42.06475319 -370.0000000 630.0000000
## Year                   -0.06455622  0.02085008   -0.3125101   0.1830565
## RegionSouthern Africa  -0.09161390  0.15240643   -5.9170481   1.9782353
## RegionSouthern Asia    -0.23127866  0.11829966   -6.1353338   1.1683005
## RegionWestern Africa   -0.21746002  0.24266285  -16.5117316   4.3234941
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 19.733654 8.901981e-06      2
## RegionSouthern Asia         Inf 0.000000e+00      2
## RegionWestern Africa   7.759546 5.342938e-03      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-130.2013 on 4 df, p=1, n=1930
## Wald test = 83.12975 on 4 df, p = 0logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef    se(coef)   lower 0.95   upper 0.95
## (Intercept)           45.40848091 64.25871853 -80.83585300 171.99995108
## Year                  -0.02380163  0.03185194  -0.08655544   0.03877092
## RegionSouthern Africa  0.85859872  0.23123433   0.39683133   1.30631592
## RegionSouthern Asia    1.73072584  0.16700314   1.40641665   2.06248351
## RegionWestern Africa  -0.76890230  0.65339403  -2.35361206   0.32017357
##                            Chisq            p method
## (Intercept)            0.4969499 0.4808433918      2
## Year                   0.5557499 0.4559777730      2
## RegionSouthern Africa 12.7880943 0.0003488323      2
## RegionSouthern Asia          Inf 0.0000000000      2
## RegionWestern Africa   1.7379098 0.1874038869      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=134.7828 on 4 df, p=0, n=1930
## Wald test = 697.314 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Asia exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)  lower 0.95  upper 0.95
## (Intercept)           -130.00000000 56.86737146 -630.000000 370.0000000
## Year                     0.06356031  0.02818327   -0.184581   0.3110891
## RegionSouthern Africa   -0.36240248  0.21810706  -24.601079   3.2446957
## RegionSouthern Asia     -0.03180749  0.15697475   -4.386919   3.3576880
## RegionWestern Africa     0.15907331  0.31044666  -15.940080   5.1151587
##                          Chisq          p method
## (Intercept)           0.000000 1.00000000      2
## Year                  0.000000 1.00000000      2
## RegionSouthern Africa 0.000000 1.00000000      2
## RegionSouthern Asia   0.000000 1.00000000      2
## RegionWestern Africa  3.959427 0.04660958      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-163.4521 on 4 df, p=1, n=1930
## Wald test = 764.3549 on 4 df, p = 0logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           89.41031753 122.38104962 -154.4304391 333.9806243
## Year                  -0.04619848   0.06066604   -0.1674539   0.0746582
## RegionSouthern Africa  0.13047017   0.49889656   -0.9492056   1.0502107
## RegionSouthern Asia    0.80257902   0.32102979    0.1628516   1.4368821
## RegionWestern Africa   1.04434411   0.52649863   -0.1271791   1.9920392
##                            Chisq          p method
## (Intercept)           0.51663877 0.47227902      2
## Year                  0.56138425 0.45370305      2
## RegionSouthern Africa 0.06645363 0.79657206      2
## RegionSouthern Asia   6.00001856 0.01430573      2
## RegionWestern Africa  3.13651422 0.07655726      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=8.283469 on 4 df, p=0.08172918, n=1930
## Wald test = 652.0744 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                              coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           130.0000000 97.68690313 -370.0000000 630.0000000
## Year                   -0.0660204  0.04842661   -0.3141732   0.1814545
## RegionSouthern Africa   1.6455483  0.26211084    0.7818739   4.1798852
## RegionSouthern Asia     0.2496183  0.27781972   -1.6554204   2.3170071
## RegionWestern Africa   -0.3623285  0.70030246  -26.9647063   2.7204386
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 35.705987 2.294578e-09      2
## RegionSouthern Asia    0.000000 1.000000e+00      2
## RegionWestern Africa   1.851984 1.735528e-01      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=42.18109 on 4 df, p=1.530031e-08, n=1930
## Wald test = 771.5683 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 63.88852738 -630.0000000 370.0000000
## Year                     0.06331655  0.03166235   -0.1848777   0.3107992
## RegionSouthern Africa   -0.41351104  0.26628667  -47.3208673   3.5797213
## RegionSouthern Asia      0.54091770  0.16604418   -1.4644141   4.8213351
## RegionWestern Africa     0.33890169  0.34736667  -12.7171776   5.6005327
##                          Chisq           p method
## (Intercept)           0.000000 1.000000000      2
## Year                  0.000000 1.000000000      2
## RegionSouthern Africa 0.000000 1.000000000      2
## RegionSouthern Asia        Inf 0.000000000      2
## RegionWestern Africa  6.844984 0.008889051      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-53.35984 on 4 df, p=1, n=1930
## Wald test = 826.8288 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Asia exceeded. Try to increase the number of iterations by
## passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef     se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 100.71414226 -630.0000000 370.0000000
## Year                     0.06272069   0.04991153   -0.1855357   0.3101549
## RegionSouthern Africa    0.37827946   0.34942716   -4.1754673   7.5149182
## RegionSouthern Asia      0.69279079   0.26254376   -1.0930093   7.8427191
## RegionWestern Africa     0.80477518   0.47878684   -6.5785507   8.5314061
##                           Chisq            p method
## (Intercept)            0.000000 1.000000e+00      2
## Year                   0.000000 1.000000e+00      2
## RegionSouthern Africa 27.152921 1.879809e-07      2
## RegionSouthern Asia         Inf 0.000000e+00      2
## RegionWestern Africa   9.053097 2.622502e-03      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-12.67631 on 4 df, p=1, n=1930
## Wald test = 770.6723 on 4 df, p = 0
## Warning in logistf(o ~ Year + Region, data = d): logistf.fit: Maximum number of
## iterations for full model exceeded. Try to increase the number of iterations or
## alter step size by passing 'logistf.control(maxit=..., maxstep=...)' to
## parameter control
## Warning in logistf(o ~ Year + Region, data = d): Nonconverged PL confidence
## limits: maximum number of iterations for variables: (Intercept), Year,
## RegionSouthern Africa, RegionSouthern Asia exceeded. Try to increase the number
## of iterations by passing 'logistpl.control(maxit=...)' to parameter plcontrol
## logistf(formula = o ~ Year + Region, data = d)
## 
## Model fitted by Penalized ML
## Coefficients:
##                                coef    se(coef)   lower 0.95  upper 0.95
## (Intercept)           -130.00000000 241.7152036 -630.0000000 370.0000000
## Year                     0.06189060   0.1197875   -0.1862222   0.3094036
## RegionSouthern Africa   -0.05452735   0.9079278  -13.2421907   5.8485309
## RegionSouthern Asia      0.12435699   0.6771676   -4.6149708   5.8730352
## RegionWestern Africa     1.36551196   0.8421475   -2.8098983   7.9488429
##                          Chisq          p method
## (Intercept)           0.000000 1.00000000      2
## Year                  0.000000 1.00000000      2
## RegionSouthern Africa 0.000000 1.00000000      2
## RegionSouthern Asia   2.109106 0.14642487      2
## RegionWestern Africa  5.213434 0.02241303      2
## 
## Method: 1-Wald, 2-Profile penalized log-likelihood, 3-None
## 
## Likelihood ratio test=-2.843686 on 4 df, p=1, n=1930
## Wald test = 335.8981 on 4 df, p = 0
logistic_sig_o <- as_tibble(oresult, rownames="variable") %>% filter(p<0.05) %>% left_join(obayes$locus_rank)
## Joining with `by = join_by(locus)`
p<-0.05

region_5pc_o <- obayes$region_est %>% select(locus, subgroup, mean) %>% 
  pivot_wider(id_cols=locus, names_from=subgroup, values_from = mean) %>% 
  mutate(d1=abs(`Eastern Africa`-`Southern Africa`),
        d2=abs(`Eastern Africa`-`Western Africa`),
        d3=abs(`Eastern Africa`-`Southern Asia`),
        d4=abs(`Southern Africa`-`Southern Asia`),
        d5=abs(`Southern Africa`-`Western Africa`),
        d6=abs(`Western Africa`-`Southern Asia`)) %>% 
  filter(d1>p | d2>p | d3>p | d4>p | d5>p | d6>p)

Fig S5 - overlapping regional densities for O types - log2 scale

obayes$region_post %>% 
    left_join(obayes$locus_rank) %>%
    filter(rank<=10) %>%
    mutate(Region=fct_relevel(subgroup,c("Western Africa", "Southern Africa", "Eastern Africa", "Southern Asia"))) %>%
    ggplot() +
    geom_hline(yintercept=11) +
    geom_hline(yintercept=21) +
    geom_density_ridges(aes(x=log2(prob), y=locus, fill=Region), 
                        scale=1, rel_min_height = 0.01, alpha=0.5, col=NA) + 
    scale_y_discrete(limits=rev(obayes$locus_rank$locus[1:10])) + 
  scale_x_continuous(limits=c(-13,0)) +
  labs(x="log2(prevalence) per region", y="", fill="Region") + 
  annotate(y=region_5pc_o$locus, label="*", x=-1, geom="text") + 
  annotate(y=logistic_sig_o$locus, label="^", x=-0.5, geom="text") +
  scale_fill_manual(values=region_cols) +
  theme_bw()
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0645
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/FigS5_RegionalOBayesDist.pdf", width=6, height=5, device = cairo_pdf, family = "Arial Unicode MS")
## Picking joint bandwidth of 0.0645
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/FigS5_RegionalOBayesDist.png", width=6, height=5)
## Picking joint bandwidth of 0.0645
## Warning: Removed 54 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Appendix Fig S1.3 = crude annual K rate per region

K_crude_perRegion_perYear <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("K_locus", "Region", "Year"), summarise_by = "Region", adj_vars = c("Cluster", "Region", "Year")) 
## grouping_var: Region in adj_vars, removing from adj_vars
## grouping_var: Year in adj_vars, removing from adj_vars
## Grouping vars: K_locus, Region, Year
## Summarising by: Region
## Adj vars: Cluster
K_crude_perRegion_perYear <- K_crude_perRegion_perYear %>% group_by(Region, Year) %>% 
  summarise(n=sum(adj_count)) %>% left_join(K_crude_perRegion_perYear) %>%
  mutate(adj_prop = adj_count/n) %>% select(-raw_sum, -adj_sum, -raw_prop, -raw_count)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(Region, Year)`
K_crude_perRegion_perYear %>% 
    left_join(kbayes$locus_rank %>% rename(K_locus=locus)) %>% 
    filter(rank <=30 | K_locus %in% c("KL116", "KL53", "KL81")) %>%
    ggplot(aes(x=Year, fill=Region, y=adj_prop)) + 
    geom_col() +
    theme_bw() + 
    theme(axis.text.x = element_text(angle=45, hjust=1, size=8), 
          axis.text.y=element_text(size=10)) + 
  facet_wrap(~rank+K_locus, scales="free_y") +
    theme(legend.position="bottom") +
  scale_fill_manual(values=region_cols)
## Joining with `by = join_by(K_locus)`

ggsave("figures/AppendixFigS1.3_crudeRegionAnnualRateK.pdf", width=7, height=9)
ggsave("figures/AppendixFigS1.3_crudeRegionAnnualRateK.png", width=7, height=9)

Appendix Fig S1.4 - simple est per year per O/region

O_crude_perRegion_perYear <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("O_genotype", "Region", "Year"), summarise_by = "Region", adj_vars = c("Cluster", "Region", "Year")) 
## grouping_var: Region in adj_vars, removing from adj_vars
## grouping_var: Year in adj_vars, removing from adj_vars
## Grouping vars: O_genotype, Region, Year
## Summarising by: Region
## Adj vars: Cluster
O_crude_perRegion_perYear <- O_crude_perRegion_perYear %>% group_by(Region, Year) %>% 
  summarise(n=sum(adj_count)) %>% left_join(O_crude_perRegion_perYear) %>%
  mutate(adj_prop = adj_count/n) %>% select(-raw_sum, -adj_sum, -raw_prop, -raw_count)
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
## Joining with `by = join_by(Region, Year)`
O_crude_perRegion_perYear %>% 
    left_join(obayes$locus_rank %>% rename(O_genotype=locus)) %>% 
    #filter(rank <=12) %>%
    ggplot(aes(x=Year, fill=Region, y=adj_prop)) + 
    geom_col() +
    theme_bw() + 
    theme(axis.text.x = element_text(angle=45, hjust=1, size=10), 
          axis.text.y=element_text(size=10)) + 
  facet_wrap(~rank+O_genotype, scales="free_y", ncol=3) +
    theme(legend.position="bottom") + 
  scale_fill_manual(values=region_cols) + labs(fill="")
## Joining with `by = join_by(O_genotype)`

ggsave("figures/AppendixFigS1.4_crudeRegionAnnualRateO.pdf", width=6, height=8, device = cairo_pdf, family = "Arial Unicode MS")
ggsave("figures/AppendixFigS1.4_crudeRegionAnnualRateO.png", width=6, height=8)

Appendix Figure S1.5 - ST vs KL coloured by region, for top 4 O types

o_region_ST <- data_NNS_sites10 %>%
  filter(O_genotype %in% c("O2β", "O4", "O2⍺", "O13")) %>%
  mutate(Region=fct_relevel(Region, c("Western Africa", "Southern Africa", "Eastern Africa", "Southern Asia"))) %>%
  ggplot(aes(col=Region, x=K_locus, y=ST)) + 
  geom_jitter(alpha=0.5) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle=45, hjust=1, size=10), 
        axis.text.y=element_text(size=9), legend.position="bottom") +
  scale_colour_manual(values=region_cols) + 
  facet_wrap(~O_genotype, scales="free") + labs(col="", y="", x="")
o_region_ST

ggsave("figures/AppendixFigS1.5_O_STvsKbyRegion.pdf", width=7, height=9, device = cairo_pdf, family = "Arial Unicode MS")
ggsave("figures/AppendixFigS1.5_O_STvsKbyRegion.png", width=7, height=9)

effect of clustering - modelled estimates

kbayes_global <- full_join(kbayes$global_est, kbayes_raw$global_est, by="locus", suffix=c(".adj", ".raw")) 

bayes_global_K_raw_vs_adj <- kbayes_global %>%
  ggplot(aes(x=mean.raw*100, y=mean.adj*100)) + 
  geom_point() + theme_bw() + 
  labs(x="Bayesian estimate (%), raw", y="Bayesian estimate (%), cluster-adjusted") +
  geom_abline(slope=1, intercept=0) + 
  geom_text(data=kbayes_global[kbayes_global$locus %in% kbayes$locus_rank$locus[1:5],], aes(x=mean.raw*100, y=mean.adj*100, label=locus), hjust=1, nudge_x=-0.2, size=2.5)
obayes_global <- full_join(obayes$global_est, obayes_raw$global_est, by="locus", suffix=c(".adj", ".raw")) 

bayes_global_O_raw_vs_adj <- obayes_global %>%
  ggplot(aes(x=mean.raw*100, y=mean.adj*100)) + 
  geom_point() + theme_bw() + 
  labs(x="Bayesian estimate (%), raw", y="Bayesian estimate (%), cluster-adjusted") +
  geom_abline(slope=1, intercept=0) + 
  geom_text(data=obayes_global[obayes_global$locus %in% obayes$locus_rank$locus[1:9],], aes(x=mean.raw*100, y=mean.adj*100, label=locus), hjust=1, nudge_x=-0.2, size=2.5)

Appendix Figure S2.4 - cluster vs raw, global K

( (bayes_global_K_raw_vs_adj + ggtitle("a) K locus (mean estimate)") ) + (bayes_global_O_raw_vs_adj  + ggtitle("b) O type (mean estimate)")) + patchwork::plot_layout(axes="collect") ) / (k_dist_raw_adj + ggtitle ("c) K locus (posterior distribution)") + o_dist_raw_adj + ggtitle ("d) O type (posterior distribution)") + patchwork::plot_layout(guides="collect")) + patchwork::plot_layout(heights=c(1,2)) & theme(legend.position='bottom')
## Picking joint bandwidth of 0.0586
## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.103
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/AppendixFigS2.4_clusterVsRaw.pdf", width=8, height=11, device = cairo_pdf, family = "Arial Unicode MS")
## Picking joint bandwidth of 0.0586
## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.103
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/AppendixFigS2.4_clusterVsRaw.png", width=8, height=11)
## Picking joint bandwidth of 0.0586
## Warning: Removed 58 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.103
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Effect of cluster adjustment on crude proportion per site

K_crude_perSite <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("K_locus", "Site"), summarise_by = "Site", adj_vars = c("Cluster", "Site")) %>%
  mutate(raw_prop = raw_count/raw_sum) %>%
  mutate(adj_prop = adj_count/adj_sum)
## grouping_var: Site in adj_vars, removing from adj_vars
## Grouping vars: K_locus, Site
## Summarising by: Site
## Adj vars: Cluster
O_crude_perSite <- data_NNS_sites10 %>%
  raw_adj_prop(grouping_vars = c("O_genotype", "Site"), summarise_by = "Site", adj_vars = c("Cluster", "Site")) %>%
  mutate(raw_prop = raw_count/raw_sum) %>%
  mutate(adj_prop = adj_count/adj_sum)
## grouping_var: Site in adj_vars, removing from adj_vars
## Grouping vars: O_genotype, Site
## Summarising by: Site
## Adj vars: Cluster
# KL25
K_crude_raw_adj_scatter_KL25 <- K_crude_perSite %>% 
  filter(K_locus=="KL25") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# KL2
K_crude_raw_adj_scatter_KL2 <- K_crude_perSite %>% 
  filter(K_locus=="KL2") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# KL102
K_crude_raw_adj_scatter_KL102 <- K_crude_perSite %>% 
  filter(K_locus=="KL102") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# KL62
K_crude_raw_adj_scatter_KL62 <- K_crude_perSite %>% 
  filter(K_locus=="KL62") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O1v1
O_crude_raw_adj_scatter_O1v1 <- O_crude_perSite %>% 
  filter(O_genotype=="O1⍺β,2⍺") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O1v2
O_crude_raw_adj_scatter_O1v2 <- O_crude_perSite %>% 
  filter(O_genotype=="O1⍺β,2β") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O2afg
O_crude_raw_adj_scatter_O2afg <- O_crude_perSite %>% 
  filter(O_genotype=="O2β") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

# O4
O_crude_raw_adj_scatter_O4 <- O_crude_perSite %>% 
  filter(O_genotype=="O4") %>%
  ggplot(aes(x=raw_prop, y=adj_prop)) +
  geom_point() + 
  geom_abline(slope=1, intercept=0) +
  theme_bw() +
  labs(x="Crude proportion, raw", y="Crude proportion, adjusted")

Appendix Fig S2.3 - Effect of cluster adjustment on crude proportion

(K_crude_raw_adj_scatter_KL102 + ggtitle("a) KL102, per-site") + K_crude_raw_adj_scatter_KL2 + ggtitle("b) KL2, per-site")) / (K_crude_raw_adj_scatter_KL25 + ggtitle("c) KL25, per-site") + K_crude_raw_adj_scatter_KL62 + ggtitle("d) KL62, per-site")) / (O_crude_raw_adj_scatter_O1v1 + ggtitle("e) O1⍺β,2⍺, per-site") + O_crude_raw_adj_scatter_O1v2 + ggtitle("f) O1⍺β,2β, per-site")) / (O_crude_raw_adj_scatter_O2afg + ggtitle("g) O2β, per-site") + O_crude_raw_adj_scatter_O4 + ggtitle("h) O4, per-site"))

ggsave("figures/AppendixFigS2.3_EffectClusteringCrude.pdf", width=8, height=10, device = cairo_pdf, family = "Arial Unicode MS")
ggsave("figures/AppendixFigS2.3_EffectClusteringCrude.png", width=8, height=10)

numbers for text - K prevalence estimates

kbayes$global_est
## # A tibble: 90 × 6
##    locus median   mean  lower  upper  rank
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
##  1 KL2   0.0895 0.0898 0.0716 0.110      1
##  2 KL102 0.0872 0.0875 0.0696 0.107      2
##  3 KL25  0.0823 0.0825 0.0646 0.102      3
##  4 KL15  0.0576 0.0579 0.0433 0.0745     4
##  5 KL62  0.0451 0.0455 0.0323 0.0604     5
##  6 KL30  0.0341 0.0345 0.0230 0.0479     6
##  7 KL10  0.0266 0.0270 0.0169 0.0392     7
##  8 KL17  0.0255 0.0259 0.0161 0.0380     8
##  9 KL23  0.0242 0.0246 0.0151 0.0366     9
## 10 KL51  0.0229 0.0233 0.0144 0.0346    10
## # ℹ 80 more rows
kbayes$global_est %>% filter(rank<=5)
## # A tibble: 5 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL2   0.0895 0.0898 0.0716 0.110      1
## 2 KL102 0.0872 0.0875 0.0696 0.107      2
## 3 KL25  0.0823 0.0825 0.0646 0.102      3
## 4 KL15  0.0576 0.0579 0.0433 0.0745     4
## 5 KL62  0.0451 0.0455 0.0323 0.0604     5
kbayes$global_est %>% filter(median>0.02)
## # A tibble: 12 × 6
##    locus median   mean  lower  upper  rank
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
##  1 KL2   0.0895 0.0898 0.0716 0.110      1
##  2 KL102 0.0872 0.0875 0.0696 0.107      2
##  3 KL25  0.0823 0.0825 0.0646 0.102      3
##  4 KL15  0.0576 0.0579 0.0433 0.0745     4
##  5 KL62  0.0451 0.0455 0.0323 0.0604     5
##  6 KL30  0.0341 0.0345 0.0230 0.0479     6
##  7 KL10  0.0266 0.0270 0.0169 0.0392     7
##  8 KL17  0.0255 0.0259 0.0161 0.0380     8
##  9 KL23  0.0242 0.0246 0.0151 0.0366     9
## 10 KL51  0.0229 0.0233 0.0144 0.0346    10
## 11 KL112 0.0230 0.0233 0.0142 0.0347    11
## 12 KL39  0.0218 0.0222 0.0131 0.0335    12
kbayes$global_est %>% filter(median>0.01)
## # A tibble: 27 × 6
##    locus median   mean  lower  upper  rank
##    <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
##  1 KL2   0.0895 0.0898 0.0716 0.110      1
##  2 KL102 0.0872 0.0875 0.0696 0.107      2
##  3 KL25  0.0823 0.0825 0.0646 0.102      3
##  4 KL15  0.0576 0.0579 0.0433 0.0745     4
##  5 KL62  0.0451 0.0455 0.0323 0.0604     5
##  6 KL30  0.0341 0.0345 0.0230 0.0479     6
##  7 KL10  0.0266 0.0270 0.0169 0.0392     7
##  8 KL17  0.0255 0.0259 0.0161 0.0380     8
##  9 KL23  0.0242 0.0246 0.0151 0.0366     9
## 10 KL51  0.0229 0.0233 0.0144 0.0346    10
## # ℹ 17 more rows
kbayes$global_est %>% filter(locus=="KL149")
## # A tibble: 1 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL149 0.0193 0.0197 0.0113 0.0298    13
kbayes$region_est %>% filter(locus=="KL149")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower  upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>  <dbl>
## 1 KL149 Eastern Africa  0.00414 0.00482 0.000794 0.0127
## 2 KL149 Southern Africa 0.0783  0.0802  0.0430   0.127 
## 3 KL149 Southern Asia   0.00583 0.00697 0.00108  0.0191
## 4 KL149 Western Africa  0.00546 0.00840 0.000459 0.0334
kbayes$global_est %>% filter(locus=="KL15")
## # A tibble: 1 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL15  0.0576 0.0579 0.0433 0.0745     4
kbayes$region_est %>% filter(locus=="KL15")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median   mean   lower  upper
##   <chr> <chr>             <dbl>  <dbl>   <dbl>  <dbl>
## 1 KL15  Eastern Africa  0.0580  0.0587 0.0375  0.0843
## 2 KL15  Southern Africa 0.00943 0.0112 0.00186 0.0300
## 3 KL15  Southern Asia   0.0902  0.0913 0.0585  0.130 
## 4 KL15  Western Africa  0.0325  0.0374 0.00728 0.0946
kbayes$global_est %>% filter(locus=="KL51")
## # A tibble: 1 × 6
##   locus median   mean  lower  upper  rank
##   <chr>  <dbl>  <dbl>  <dbl>  <dbl> <int>
## 1 KL51  0.0229 0.0233 0.0144 0.0346    10
kbayes$region_est %>% filter(locus=="KL51")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower  upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>  <dbl>
## 1 KL51  Eastern Africa  0.00363 0.00433 0.000635 0.0120
## 2 KL51  Southern Africa 0.00229 0.00346 0.000195 0.0135
## 3 KL51  Southern Asia   0.0682  0.0698  0.0415   0.106 
## 4 KL51  Western Africa  0.00418 0.00680 0.000287 0.0279
kbayes$global_est %>% filter(locus=="KL81")
## # A tibble: 1 × 6
##   locus median   mean   lower  upper  rank
##   <chr>  <dbl>  <dbl>   <dbl>  <dbl> <int>
## 1 KL81  0.0119 0.0123 0.00584 0.0209    26
kbayes$region_est %>% filter(locus=="KL81")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower   upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>   <dbl>
## 1 KL81  Eastern Africa  0.00156 0.00217 0.000153 0.00742
## 2 KL81  Southern Africa 0.00152 0.00251 0.000104 0.0108 
## 3 KL81  Southern Asia   0.0350  0.0362  0.0163   0.0630 
## 4 KL81  Western Africa  0.00262 0.00480 0.000155 0.0222
kbayes$global_est %>% filter(locus=="KL28")
## # A tibble: 1 × 6
##   locus median   mean   lower  upper  rank
##   <chr>  <dbl>  <dbl>   <dbl>  <dbl> <int>
## 1 KL28  0.0131 0.0135 0.00679 0.0223    22
kbayes$region_est %>% filter(locus=="KL28")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower  upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>  <dbl>
## 1 KL28  Eastern Africa  0.0127  0.0135  0.00481  0.0269
## 2 KL28  Southern Africa 0.00616 0.00782 0.000906 0.0242
## 3 KL28  Southern Asia   0.00367 0.00463 0.000512 0.0145
## 4 KL28  Western Africa  0.0618  0.0669  0.0201   0.140
kbayes$global_est %>% filter(locus=="KL116")
## # A tibble: 1 × 6
##   locus  median    mean   lower  upper  rank
##   <chr>   <dbl>   <dbl>   <dbl>  <dbl> <int>
## 1 KL116 0.00579 0.00615 0.00199 0.0124    36
kbayes$region_est %>% filter(locus=="KL116")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean    lower   upper
##   <chr> <chr>             <dbl>   <dbl>    <dbl>   <dbl>
## 1 KL116 Eastern Africa  0.00159 0.00218 0.000158 0.00752
## 2 KL116 Southern Africa 0.00158 0.00261 0.000108 0.0110 
## 3 KL116 Southern Asia   0.00212 0.00301 0.000183 0.0109 
## 4 KL116 Western Africa  0.0511  0.0561  0.0135   0.129
kbayes$global_est %>% filter(locus=="KL53")
## # A tibble: 1 × 6
##   locus  median    mean   lower  upper  rank
##   <chr>   <dbl>   <dbl>   <dbl>  <dbl> <int>
## 1 KL53  0.00448 0.00487 0.00132 0.0106    48
kbayes$region_est %>% filter(locus=="KL53")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup         median    mean     lower   upper
##   <chr> <chr>             <dbl>   <dbl>     <dbl>   <dbl>
## 1 KL53  Eastern Africa  0.00144 0.00201 0.000140  0.00727
## 2 KL53  Southern Africa 0.00137 0.00228 0.0000881 0.0101 
## 3 KL53  Southern Asia   0.00192 0.00276 0.000154  0.0103 
## 4 KL53  Western Africa  0.0352  0.0404  0.00747   0.100

numbers for text - O antigen

obayes$global_est
## # A tibble: 15 × 6
##    locus      median    mean     lower   upper  rank
##    <chr>       <dbl>   <dbl>     <dbl>   <dbl> <int>
##  1 O1⍺β,2⍺  0.267    0.267   0.236     0.299       1
##  2 O1⍺β,2β  0.218    0.218   0.191     0.248       2
##  3 O2β      0.158    0.158   0.135     0.184       3
##  4 O4       0.0905   0.0908  0.0720    0.111       4
##  5 O2⍺      0.0611   0.0615  0.0458    0.0794      5
##  6 O3𝛾      0.0597   0.0601  0.0449    0.0775      6
##  7 O5       0.0535   0.0539  0.0394    0.0702      7
##  8 O13      0.0487   0.0490  0.0352    0.0647      8
##  9 O3⍺/O3β  0.0279   0.0283  0.0181    0.0405      9
## 10 O15      0.00329  0.00369 0.000744  0.00893    10
## 11 O12      0.00209  0.00248 0.000311  0.00683    11
## 12 O10      0.00206  0.00246 0.000282  0.00691    12
## 13 O14      0.000853 0.00123 0.0000318 0.00442    13
## 14 O2⍺𝛾     0.000844 0.00122 0.0000334 0.00439    14
## 15 O1⍺β,2⍺𝛾 0.000843 0.00121 0.0000312 0.00446    15
obayes$global_est %>% filter(rank<=5) %>% pull(median) %>% sum()
## [1] 0.7947118
obayes$global_est %>% filter(rank<=5) %>% pull(lower) %>% sum()
## [1] 0.6802535
obayes$global_est %>% filter(rank<=5) %>% pull(upper) %>% sum()
## [1] 0.9214287
obayes$global_est
## # A tibble: 15 × 6
##    locus      median    mean     lower   upper  rank
##    <chr>       <dbl>   <dbl>     <dbl>   <dbl> <int>
##  1 O1⍺β,2⍺  0.267    0.267   0.236     0.299       1
##  2 O1⍺β,2β  0.218    0.218   0.191     0.248       2
##  3 O2β      0.158    0.158   0.135     0.184       3
##  4 O4       0.0905   0.0908  0.0720    0.111       4
##  5 O2⍺      0.0611   0.0615  0.0458    0.0794      5
##  6 O3𝛾      0.0597   0.0601  0.0449    0.0775      6
##  7 O5       0.0535   0.0539  0.0394    0.0702      7
##  8 O13      0.0487   0.0490  0.0352    0.0647      8
##  9 O3⍺/O3β  0.0279   0.0283  0.0181    0.0405      9
## 10 O15      0.00329  0.00369 0.000744  0.00893    10
## 11 O12      0.00209  0.00248 0.000311  0.00683    11
## 12 O10      0.00206  0.00246 0.000282  0.00691    12
## 13 O14      0.000853 0.00123 0.0000318 0.00442    13
## 14 O2⍺𝛾     0.000844 0.00122 0.0000334 0.00439    14
## 15 O1⍺β,2⍺𝛾 0.000843 0.00121 0.0000312 0.00446    15

numbers for text - O differences by region

# where is O4 found
data_NNS_sites10 %>% filter(O_genotype=="O4") %>% group_by(Region, Study, Site, ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 52 × 6
## # Groups:   Region, Study, Site, ST, K_locus [52]
##    Region          Study         Site ST    K_locus     n
##    <chr>           <chr>        <dbl> <chr> <chr>   <int>
##  1 Southern Asia   CHRF            37 ST11  KL15       59
##  2 Eastern Africa  BARNARDS        17 ST37  KL15       28
##  3 Eastern Africa  MLW             33 ST340 KL15       18
##  4 Southern Asia   NeoOBS-India    46 ST11  KL15       18
##  5 Southern Africa BabyGERMS       27 ST152 KL149      16
##  6 Southern Asia   CHRF            37 ST502 KL15       15
##  7 Southern Africa BabyGERMS       28 ST405 KL151       9
##  8 Eastern Africa  MLW             33 ST45  KL15        8
##  9 Southern Asia   CHRF            38 ST11  KL15        7
## 10 Eastern Africa  Kilifi          49 ST585 KL15        6
## # ℹ 42 more rows
data_NNS_sites10 %>% filter(O_genotype=="O4" & Region=="Southern Asia") %>% group_by(ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 16 × 3
## # Groups:   ST, K_locus [16]
##    ST        K_locus     n
##    <chr>     <chr>   <int>
##  1 ST11      KL15       91
##  2 ST502     KL15       15
##  3 ST437     KL36       10
##  4 ST340     KL15        5
##  5 ST152     KL149       3
##  6 ST273     KL15        3
##  7 ST2258    KL131       2
##  8 ST37      KL15        2
##  9 ST231     KL144       1
## 10 ST273     KL141       1
## 11 ST3623    KL15        1
## 12 ST392     KL27        1
## 13 ST429     KL27        1
## 14 ST70      KL15        1
## 15 ST726     KL15        1
## 16 ST883-1LV KL15        1
data_NNS_sites10 %>% filter(O_genotype=="O4" & Region=="Southern Asia") %>% group_by(ST, K_locus, Site) %>% count() %>% arrange(-n)
## # A tibble: 27 × 4
## # Groups:   ST, K_locus, Site [27]
##    ST    K_locus  Site     n
##    <chr> <chr>   <dbl> <int>
##  1 ST11  KL15       37    59
##  2 ST11  KL15       46    18
##  3 ST502 KL15       37    15
##  4 ST11  KL15       38     7
##  5 ST11  KL15        2     6
##  6 ST437 KL36       37     6
##  7 ST152 KL149      37     3
##  8 ST273 KL15       37     3
##  9 ST340 KL15       35     2
## 10 ST340 KL15       37     2
## # ℹ 17 more rows
data_NNS_sites10 %>% filter(O_genotype=="O2a.v1") %>% group_by(Region, Study, Site, ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 0 × 6
## # Groups:   Region, Study, Site, ST, K_locus [0]
## # ℹ 6 variables: Region <chr>, Study <chr>, Site <dbl>, ST <chr>,
## #   K_locus <chr>, n <int>
data_NNS_sites10 %>% filter(O_genotype=="O2a.v1" & Region=="Southern Asia") %>% group_by(ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 0 × 3
## # Groups:   ST, K_locus [0]
## # ℹ 3 variables: ST <chr>, K_locus <chr>, n <int>
data_NNS_sites10 %>% filter(O_genotype=="O2a.v1" & Region=="Southern Asia") %>% group_by(ST, K_locus, Site) %>% count() %>% arrange(-n)
## # A tibble: 0 × 4
## # Groups:   ST, K_locus, Site [0]
## # ℹ 4 variables: ST <chr>, K_locus <chr>, Site <dbl>, n <int>
data_NNS_sites10 %>% filter(O_genotype=="O5") %>% group_by(Region, Study, Site, ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 21 × 6
## # Groups:   Region, Study, Site, ST, K_locus [21]
##    Region          Study      Site ST    K_locus     n
##    <chr>           <chr>     <dbl> <chr> <chr>   <int>
##  1 Eastern Africa  Kilifi       49 ST17  KL25       15
##  2 Southern Africa GBS          42 ST17  KL25        9
##  3 Southern Africa BabyGERMS    30 ST17  KL25        7
##  4 Eastern Africa  MLW          33 ST17  KL25        4
##  5 Southern Africa BabyGERMS    29 ST17  KL25        4
##  6 Eastern Africa  Kilifi       49 ST336 KL25        3
##  7 Southern Africa BabyGERMS    28 ST17  KL25        3
##  8 Southern Africa NIMBIplus    59 ST17  KL25        3
##  9 Southern Asia   AKU           8 ST17  KL25        3
## 10 Southern Africa BabyGERMS    27 ST17  KL25        2
## # ℹ 11 more rows
data_NNS_sites10 %>% filter(O_genotype=="O5" & Region=="Southern Africa") %>% group_by(ST, K_locus) %>% count() %>% arrange(-n)
## # A tibble: 3 × 3
## # Groups:   ST, K_locus [3]
##   ST     K_locus     n
##   <chr>  <chr>   <int>
## 1 ST17   KL25       32
## 2 ST2621 KL25        1
## 3 ST3184 KL25        1
data_NNS_sites10 %>% filter(O_genotype=="O5" & Region=="Southern Africa") %>% group_by(ST, K_locus, Site) %>% count() %>% arrange(-n)
## # A tibble: 10 × 4
## # Groups:   ST, K_locus, Site [10]
##    ST     K_locus  Site     n
##    <chr>  <chr>   <dbl> <int>
##  1 ST17   KL25       42     9
##  2 ST17   KL25       30     7
##  3 ST17   KL25       29     4
##  4 ST17   KL25       28     3
##  5 ST17   KL25       59     3
##  6 ST17   KL25       27     2
##  7 ST17   KL25       31     2
##  8 ST17   KL25       32     2
##  9 ST2621 KL25       30     1
## 10 ST3184 KL25       32     1

Global and regional coverage - top20 global - Fig3bi

kbayes_raw_coverage <- getCumulativeCoverageGlobalRegional(kbayes_raw, kbayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

coverage_globalTop20_globalRegional <- plotCoverage(kbayes_raw_coverage, kbayes$locus_rank, maxRank=20, cols=region_cols)

coverage_globalTop20_globalRegional

fatal - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

# read raw Bayesian estimates for K - using fatal samples from sites with at least 10 fatal
kbayes_fatal_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/K_Fatal_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_Fatal_min10_raw_28_posterior_subgroup.csv.gz")
## Rows: 576000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 2304000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
kbayes_fatal_raw_min10perSite_coverage <- getCumulativeCoverage(kbayes$locus_rank, kbayes_fatal_raw_min10perSite$global_post) %>% mutate(type="min10")

ESBL - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

kbayes_esbl_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/K_ESBL_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_ESBL_min10_raw_28_posterior_subgroup.csv.gz")
## Rows: 960000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 3840000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
kbayes_esbl_raw_min10perSite_coverage <- getCumulativeCoverage(kbayes$locus_rank, kbayes_esbl_raw_min10perSite$global_post) %>% mutate(type="min10")

CP - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

kbayes_cp_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/K_Carba_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/K_Carba_min10_raw_28_posterior_subgroup.csv.gz")
## Rows: 588000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1764000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
kbayes_cp_raw_min10perSite_coverage <- getCumulativeCoverage(kbayes$locus_rank, kbayes_cp_raw_min10perSite$global_post) %>% mutate(type="min10")

combine coverage estimates for subgroups (fatal, ESBL, CP) - Fig3ai

kbayes_raw_coverage_subgroups <- kbayes_esbl_raw_min10perSite_coverage %>% mutate(subgroup="ESBL") %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverage %>% mutate(subgroup="CP")) %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverage %>% mutate(subgroup="Fatal")) %>%
  bind_rows(kbayes_raw_coverage %>% filter(subgroup=="Global") %>% mutate(subgroup="All") ) %>%
  mutate(subgroup=fct_relevel(subgroup,c("All", "Fatal", "ESBL", "CP")))

coverage_globalTop20_subgroups <- plotCoverage(kbayes_raw_coverage_subgroups, kbayes$locus_rank, maxRank=20, cols=group_cols)

coverage_globalTop20_subgroups

take top 8 per region - Fig3aii and Fig3bii

## extract cluster-adjusted coverage per region
K_rank_SA <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Southern Africa") %>% arrange(-mean) %>% mutate(rank=row_number())
K_rank_WA <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Western Africa") %>% arrange(-mean) %>% mutate(rank=row_number())
K_rank_EA <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Eastern Africa") %>% arrange(-mean) %>% mutate(rank=row_number())
K_rank_SAs <- kbayes$region_est %>% ungroup %>% filter(subgroup=="Southern Asia") %>% arrange(-mean) %>% mutate(rank=row_number())


# get top-8 within each region - gives 20 loci
n<-8

topN <- bind_rows(K_rank_SA, K_rank_EA) %>% bind_rows(K_rank_WA) %>% bind_rows(K_rank_SAs) %>% filter(rank<=n) %>% pull(locus) %>% unique()

length(unique(topN))
## [1] 20
k_rank_region8 <- kbayes$locus_rank[kbayes$locus_rank$locus %in% topN,] %>% 
  mutate(rank=1:(length(unique(topN))))

# calculate global & regional coverage for these K - Fig 3bii
kbayes_raw_coverage_top8perRegion <- getCumulativeCoverageGlobalRegional(kbayes_raw, k_rank_region8) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

coverage_global_top8perRegion <- plotCoverage(kbayes_raw_coverage_top8perRegion, k_rank_region8, maxRank=nrow(k_rank_region8), cols=region_cols)

coverage_global_top8perRegion

# calculate subgroup coverage for these K - Fig 3aii
kbayes_fatal_raw_min10perSite_coverage_top8perRegion <- getCumulativeCoverage(k_rank_region8, kbayes_fatal_raw_min10perSite$global_post)
kbayes_esbl_raw_min10perSite_coverage_top8perRegion <- getCumulativeCoverage(k_rank_region8, kbayes_esbl_raw_min10perSite$global_post)
kbayes_cp_raw_min10perSite_coverage_top8perRegion <- getCumulativeCoverage(k_rank_region8, kbayes_cp_raw_min10perSite$global_post)

kbayes_raw_coverage_subgroups_top8perRegion <- kbayes_esbl_raw_min10perSite_coverage_top8perRegion %>% mutate(subgroup="ESBL") %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverage_top8perRegion %>% mutate(subgroup="CP")) %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverage_top8perRegion %>% mutate(subgroup="Fatal")) %>%
  bind_rows(kbayes_raw_coverage_top8perRegion %>% filter(subgroup=="Global") %>% mutate(subgroup="All") ) %>%
  mutate(subgroup=fct_relevel(subgroup,c("All", "Fatal", "ESBL", "CP")))

coverage_subgroups_top8perRegion <- plotCoverage(kbayes_raw_coverage_subgroups_top8perRegion, k_rank_region8, maxRank=nrow(k_rank_region8), cols=group_cols)

coverage_subgroups_top8perRegion

(coverage_globalTop20_subgroups + ggtitle("a) Global coverage, by case type", subtitle="KL set 1: global top-20") + coverage_subgroups_top8perRegion + ggtitle("", subtitle="KL set 2: top-8 per region") + patchwork::plot_layout(guides="collect", axes="collect")) / (coverage_globalTop20_globalRegional + ggtitle("b) Per-region coverage", subtitle="KL set 1: global top-20") + coverage_global_top8perRegion + ggtitle("", subtitle="KL set 2: top-8 per region") + patchwork::plot_layout(guides="collect", axes="collect"))

ggsave("figures/Fig3_Kcoverage_raw.pdf", width=8, height=8)
ggsave("figures/Fig3_Kcoverage_raw.png", width=8, height=8)

Table 2 - coverage stats for all/ESBL/CP/fatal, for global and regional

kbayes_fatal_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=kbayes$locus_rank, bayes=kbayes_fatal_raw_min10perSite, maxRank=20)

kbayes_esbl_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=kbayes$locus_rank, bayes=kbayes_esbl_raw_min10perSite, maxRank=20)

kbayes_cp_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=kbayes$locus_rank, bayes=kbayes_cp_raw_min10perSite, maxRank=20)

table2 <- kbayes_raw_coverage %>% mutate(cases="All") %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverageRegional %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(kbayes_esbl_raw_min10perSite_coverageRegional %>% mutate(cases="  ESBL infections")) %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverageRegional %>% mutate(cases="  CP infections")) %>%
  filter(rank %in% c(5,10,15,20)) %>% 
  mutate(upper=if_else(upper>1, 1, upper)) %>%
  mutate(summary=paste0(round(mean,3)*100," (",round(lower,3)*100,"-",round(upper,3)*100,")")) %>% 
  select(rank, subgroup, cases, summary) %>% 
  pivot_wider(names_from=rank, values_from=summary, id_cols=c(subgroup, cases)) %>%
  rename(Region=subgroup)

add totals per subgroup

fatal_counts <- data_NNS_sites10 %>% filter(Mortality==1) %>% group_by(Region) %>% count()
fatal_counts <- fatal_counts %>% bind_rows(list(Region="Global", n=sum(fatal_counts$n)))

esbl_counts <- data_NNS_sites10 %>% filter(resistance_score>=1) %>% group_by(Region) %>% count()
esbl_counts <- esbl_counts %>% bind_rows(list(Region="Global", n=sum(esbl_counts$n)))

cp_counts <- data_NNS_sites10 %>% filter(resistance_score>=2) %>% group_by(Region) %>% count()
cp_counts <- cp_counts %>% bind_rows(list(Region="Global", n=sum(cp_counts$n)))

total_counts <- data_NNS_sites10 %>% group_by(Region) %>% count()
total_counts <- total_counts %>% bind_rows(list(Region="Global", n=sum(total_counts$n)))
table2 <- esbl_counts %>% mutate(cases="  ESBL infections") %>%
  bind_rows(cp_counts%>% mutate(cases="  CP infections")) %>%
  bind_rows(fatal_counts %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(total_counts %>% mutate(cases="All")) %>%
  full_join(table2, by=c("Region", "cases")) %>%
  mutate(cases=fct_relevel(cases,c("All", "  Fatal infections", "  ESBL infections", "  CP infections")))

# sort function isn't working when ordering region factor using fct_relevel
table2$Region <- factor(table2$Region, levels = c("Global", "Eastern Africa", "Southern Africa", "Western Africa", "Southern Asia"))

table2 <- table2 %>%
  arrange(Region, cases) %>%
  relocate(n, .after=cases) %>% 
  mutate(cases=if_else(cases=="All", Region, cases)) %>%
  rename_with(~ paste0(.x, " KL"), c("5", "10", "15", "20")) %>%
  rename(N=n) %>%
  rename(Data=cases) %>%
  ungroup %>% select(-Region)
  
write_tsv(table2, file="tables/Table2_coverage.tsv")

Table S4 - K set2 coverage stats for all/ESBL/CP/fatal, for global and regional

kbayes_fatal_raw_min10perSite_coverageRegional_top8perRegion <- getCumulativeCoverageGlobalRegional(ranks=k_rank_region8, bayes=kbayes_fatal_raw_min10perSite, maxRank=20)

kbayes_esbl_raw_min10perSite_coverageRegional_top8perRegion <- getCumulativeCoverageGlobalRegional(ranks=k_rank_region8, bayes=kbayes_esbl_raw_min10perSite, maxRank=20)

kbayes_cp_raw_min10perSite_coverageRegional_top8perRegion <- getCumulativeCoverageGlobalRegional(ranks=k_rank_region8, bayes=kbayes_cp_raw_min10perSite, maxRank=20)

tableS4 <- kbayes_raw_coverage_top8perRegion %>% mutate(cases="All") %>%
  bind_rows(kbayes_fatal_raw_min10perSite_coverageRegional_top8perRegion %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(kbayes_esbl_raw_min10perSite_coverageRegional_top8perRegion %>% mutate(cases="  ESBL infections")) %>%
  bind_rows(kbayes_cp_raw_min10perSite_coverageRegional_top8perRegion %>% mutate(cases="  CP infections")) %>%
  mutate(upper=if_else(upper>1, 1, upper)) %>%
  filter(rank %in% c(5,10,15,20)) %>% 
  mutate(summary=paste0(round(mean,3)*100," (",round(lower,3)*100,"-",round(upper,3)*100,")")) %>% 
  select(rank, subgroup, cases, summary) %>% 
  pivot_wider(names_from=rank, values_from=summary, id_cols=c(subgroup, cases)) %>%
  rename(Region=subgroup)
tableS4 <- esbl_counts %>% mutate(cases="  ESBL infections") %>%
  bind_rows(cp_counts%>% mutate(cases="  CP infections")) %>%
  bind_rows(fatal_counts %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(total_counts %>% mutate(cases="All")) %>%
  full_join(tableS4, by=c("Region", "cases")) %>%
  mutate(cases=fct_relevel(cases,c("All", "  Fatal infections", "  ESBL infections", "  CP infections")))

# sort function isn't working when ordering region factor using fct_relevel
tableS4$Region <- factor(tableS4$Region, levels = c("Global", "Eastern Africa", "Southern Africa", "Western Africa", "Southern Asia"))

tableS4 <- tableS4 %>%
  arrange(Region, cases) %>%
  relocate(n, .after=cases) %>% 
  mutate(cases=if_else(cases=="All", Region, cases)) %>%
  rename_with(~ paste0(.x, " KL"), c("5", "10", "15", "20")) %>%
  rename(N=n) %>%
  rename(Data=cases) %>%
  ungroup %>% select(-Region)
  
write_tsv(tableS4, file="tables/TableS4_coverageSet2.tsv")

Fig S4 - coverage for region-focused K sets

# ranked by Southern Africa
kbayes_raw_coverage_top20SA <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_SA %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20SA <- plotCoverage(kbayes_raw_coverage_top20SA, K_rank_SA %>% select(locus, rank), maxRank=20, cols=region_cols)

# ranked by Eastern Africa
kbayes_raw_coverage_top20EA <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_EA %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20EA <- plotCoverage(kbayes_raw_coverage_top20EA, K_rank_EA %>% select(locus, rank), maxRank=20, cols=region_cols)

# ranked by Western Africa
kbayes_raw_coverage_top20WA <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_WA %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20WA <- plotCoverage(kbayes_raw_coverage_top20WA, K_rank_WA %>% select(locus, rank), maxRank=20, cols=region_cols)

# ranked by Southern Asia
kbayes_raw_coverage_top20SAs <- getCumulativeCoverageGlobalRegional(kbayes_raw, K_rank_SAs %>% select(locus, rank), maxRank=20) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

kbayes_raw_coverage_globalRegional_top20SAs <- plotCoverage(kbayes_raw_coverage_top20SAs, K_rank_SAs %>% select(locus, rank), maxRank=20, cols=region_cols)

(kbayes_raw_coverage_globalRegional_top20SA + ggtitle("Top-20 K loci in S. Africa") + kbayes_raw_coverage_globalRegional_top20EA + ggtitle("Top-20 K loci in E. Africa")) / (kbayes_raw_coverage_globalRegional_top20WA + ggtitle("Top-20 K loci in W. Africa") + kbayes_raw_coverage_globalRegional_top20SAs + ggtitle("Top-20 K loci in S. Asia")) + patchwork::plot_layout(guides="collect")

ggsave("figures/FigS4_KcoverageRaw_top20PerRegion.pdf", width=8, height=8)
ggsave("figures/FigS4_KcoverageRaw_top20PerRegion.png", width=8, height=8)

numbers for text - K coverage

# coverage of top-20
kbayes_raw_coverage %>% filter(rank %in% c(3,5,10,15,20) & subgroup=="Global") %>% select(mean, lower, upper) %>% mutate(rank=c(3,5,10,15,20))
## # A tibble: 5 × 4
##    mean lower upper  rank
##   <dbl> <dbl> <dbl> <dbl>
## 1 0.351 0.328 0.376     3
## 2 0.488 0.460 0.517     5
## 3 0.587 0.556 0.619    10
## 4 0.670 0.636 0.704    15
## 5 0.728 0.693 0.764    20
kbayes_raw_coverage %>% filter(rank==10 & subgroup=="Global") %>% select(mean, lower, upper) - (kbayes_raw_coverage %>% filter(rank==5 & subgroup=="Global") %>% select(mean, lower, upper))
##         mean      lower     upper
## 1 0.09899569 0.09608937 0.1023605
kbayes_raw_coverage %>% filter(rank==15 & subgroup=="Global") %>% select(mean, lower, upper) - (kbayes_raw_coverage %>% filter(rank==10 & subgroup=="Global") %>% select(mean, lower, upper))
##         mean      lower     upper
## 1 0.08240133 0.08031075 0.0843404
kbayes_raw_coverage %>% filter(rank==20 & subgroup=="Global") %>% select(mean, lower, upper) - (kbayes_raw_coverage %>% filter(rank==15 & subgroup=="Global") %>% select(mean, lower, upper))
##         mean      lower      upper
## 1 0.05813391 0.05676994 0.05986134
# available data on fatal cases
data_NNS_sites10 %>% filter(!is.na(Mortality)) %>% nrow()
## [1] 1063
data_NNS_sites10 %>% filter(Mortality==1) %>% nrow()
## [1] 394
# coverage of region-focused subsets
kbayes_raw_coverage_globalRegional_top20SA$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.709  0.709 0.675 0.744 KL28  Global         
## 2    20 0.735  0.735 0.690 0.783 KL28  Eastern Africa 
## 3    20 0.569  0.568 0.509 0.633 KL28  Southern Asia  
## 4    20 0.904  0.903 0.794 1     KL28  Southern Africa
## 5    20 0.601  0.596 0.451 0.774 KL28  Western Africa
kbayes_raw_coverage_globalRegional_top20EA$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.715  0.715 0.681 0.751 KL64  Global         
## 2    20 0.810  0.810 0.762 0.861 KL64  Eastern Africa 
## 3    20 0.576  0.575 0.516 0.642 KL64  Southern Asia  
## 4    20 0.590  0.589 0.503 0.683 KL64  Southern Africa
## 5    20 0.622  0.618 0.470 0.796 KL64  Western Africa
kbayes_raw_coverage_globalRegional_top20WA$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.696  0.696 0.662 0.730 KL30  Global         
## 2    20 0.786  0.786 0.739 0.836 KL30  Eastern Africa 
## 3    20 0.532  0.531 0.474 0.593 KL30  Southern Asia  
## 4    20 0.581  0.579 0.494 0.672 KL30  Southern Africa
## 5    20 0.798  0.794 0.620 0.995 KL30  Western Africa
kbayes_raw_coverage_globalRegional_top20SAs$data %>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.690  0.690 0.656 0.725 KL54  Global         
## 2    20 0.682  0.681 0.638 0.727 KL54  Eastern Africa 
## 3    20 0.807  0.806 0.735 0.883 KL54  Southern Asia  
## 4    20 0.566  0.565 0.481 0.657 KL54  Southern Africa
## 5    20 0.450  0.446 0.322 0.600 KL54  Western Africa
# coverage of KL set 2
kbayes_raw_coverage_top8perRegion%>% filter(rank==20)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1    20 0.729  0.729 0.694 0.765 KL53  Global         
## 2    20 0.715  0.715 0.670 0.763 KL53  Eastern Africa 
## 3    20 0.719  0.718 0.651 0.791 KL53  Southern Asia  
## 4    20 0.822  0.821 0.721 0.928 KL53  Southern Africa
## 5    20 0.698  0.695 0.533 0.882 KL53  Western Africa
kbayes_raw_coverage_subgroups_top8perRegion %>% filter(rank==20)
## # A tibble: 4 × 7
##    rank  mean median lower upper locus subgroup
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>   
## 1    20 0.749  0.749 0.712 0.788 KL53  ESBL    
## 2    20 0.815  0.815 0.742 0.892 KL53  CP      
## 3    20 0.706  0.705 0.628 0.785 KL53  Fatal   
## 4    20 0.729  0.729 0.694 0.765 KL53  All
# what is extra in set2 that is not in top-20?
k_rank_region8 %>% left_join(kbayes$locus, by="locus") %>% filter(rank.y>20)
## # A tibble: 5 × 3
##   locus rank.x rank.y
##   <chr>  <int>  <int>
## 1 KL28      16     22
## 2 KL8       17     23
## 3 KL81      18     26
## 4 KL116     19     36
## 5 KL53      20     48

Global and regional O coverage - top10 global - Fig4d

obayes_raw_coverage <- getCumulativeCoverageGlobalRegional(obayes_raw, obayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))

ocoverage_globalTop10_globalRegional <- plotCoverage(obayes_raw_coverage, obayes$locus_rank, maxRank=10, xintercept=c(5,10), cols=region_cols, col_title = "d) Region")

ocoverage_globalTop10_globalRegional

O fatal - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

obayes_fatal_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/O_Fatal_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/O_Fatal_min10_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 132000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 528000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
obayes_fatal_raw_min10perSite_coverage <- getCumulativeCoverage(obayes$locus_rank, obayes_fatal_raw_min10perSite$global_post) %>% mutate(type="min10")

ESBL - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

obayes_esbl_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/O_ESBL_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/O_ESBL_min10_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 156000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 624000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
obayes_esbl_raw_min10perSite_coverage <- getCumulativeCoverage(obayes$locus_rank, obayes_esbl_raw_min10perSite$global_post) %>% mutate(type="min10")

CP - read posterior estimates for adjusted counts (main) and raw counts (for comparison)

obayes_cp_raw_min10perSite <- parseModelledEstimates(global_post="outputs_core/O_Carba_min10_raw_28_posterior_global.csv.gz", region_post="outputs_core/O_Carba_min10_raw_28_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 120000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 360000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
obayes_cp_raw_min10perSite_coverage <- getCumulativeCoverage(obayes$locus_rank, obayes_cp_raw_min10perSite$global_post) %>% mutate(type="min10")

combine coverage estimates for subgroups (fatal, ESBL, CP) - Fig4c

obayes_raw_coverage_subgroups <- obayes_esbl_raw_min10perSite_coverage %>% mutate(subgroup="ESBL") %>%
  bind_rows(obayes_cp_raw_min10perSite_coverage %>% mutate(subgroup="CP")) %>%
  bind_rows(obayes_fatal_raw_min10perSite_coverage %>% mutate(subgroup="Fatal")) %>%
  bind_rows(obayes_raw_coverage %>% filter(subgroup=="Global") %>% mutate(subgroup="All") ) %>%
  mutate(subgroup=fct_relevel(subgroup,c("All", "Fatal", "ESBL", "CP")))

o_coverage_globalTop10_subgroups <- plotCoverage(obayes_raw_coverage_subgroups, obayes$locus_rank, maxRank=10, xintercept=c(5,10), cols=group_cols, col_title = "c) Subgroup")

o_coverage_globalTop10_subgroups

(o_prev_global_dist + o_prev_region_heatmap) / (o_coverage_globalTop10_subgroups + ggtitle("c) Global coverage") + ocoverage_globalTop10_globalRegional + ggtitle("d) Regional coverage") + patchwork::plot_layout(guides="collect"))
## Picking joint bandwidth of 0.0901
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/Fig4_O_prevAdj_coverageRaw.pdf", width=8, height=8, device = cairo_pdf, family = "Arial Unicode MS")
## Picking joint bandwidth of 0.0901
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/Fig4_O_prevAdj_coverageRaw.png", width=8, height=8)
## Picking joint bandwidth of 0.0901
## Warning: Removed 257 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

numbers for text - O prevalence and coverage

# O prevalence
obayes$global_est
## # A tibble: 15 × 6
##    locus      median    mean     lower   upper  rank
##    <chr>       <dbl>   <dbl>     <dbl>   <dbl> <int>
##  1 O1⍺β,2⍺  0.267    0.267   0.236     0.299       1
##  2 O1⍺β,2β  0.218    0.218   0.191     0.248       2
##  3 O2β      0.158    0.158   0.135     0.184       3
##  4 O4       0.0905   0.0908  0.0720    0.111       4
##  5 O2⍺      0.0611   0.0615  0.0458    0.0794      5
##  6 O3𝛾      0.0597   0.0601  0.0449    0.0775      6
##  7 O5       0.0535   0.0539  0.0394    0.0702      7
##  8 O13      0.0487   0.0490  0.0352    0.0647      8
##  9 O3⍺/O3β  0.0279   0.0283  0.0181    0.0405      9
## 10 O15      0.00329  0.00369 0.000744  0.00893    10
## 11 O12      0.00209  0.00248 0.000311  0.00683    11
## 12 O10      0.00206  0.00246 0.000282  0.00691    12
## 13 O14      0.000853 0.00123 0.0000318 0.00442    13
## 14 O2⍺𝛾     0.000844 0.00122 0.0000334 0.00439    14
## 15 O1⍺β,2⍺𝛾 0.000843 0.00121 0.0000312 0.00446    15
obayes$region_est
## # A tibble: 60 × 6
## # Groups:   locus [15]
##    locus subgroup         median    mean    lower   upper
##    <chr> <chr>             <dbl>   <dbl>    <dbl>   <dbl>
##  1 O10   Eastern Africa  0.00133 0.00176 0.000144 0.00587
##  2 O10   Southern Africa 0.00131 0.00189 0.000121 0.00720
##  3 O10   Southern Asia   0.00310 0.00395 0.000379 0.0123 
##  4 O10   Western Africa  0.00152 0.00229 0.000145 0.00886
##  5 O12   Eastern Africa  0.00179 0.00231 0.000218 0.00739
##  6 O12   Southern Africa 0.00135 0.00192 0.000129 0.00704
##  7 O12   Southern Asia   0.00239 0.00314 0.000295 0.0101 
##  8 O12   Western Africa  0.00155 0.00235 0.000146 0.00910
##  9 O13   Eastern Africa  0.0461  0.0469  0.0289   0.0695 
## 10 O13   Southern Africa 0.0158  0.0172  0.00436  0.0381 
## # ℹ 50 more rows
obayes$region_est %>% filter(locus=="O2afg")
## # A tibble: 0 × 6
## # Groups:   locus [0]
## # ℹ 6 variables: locus <chr>, subgroup <chr>, median <dbl>, mean <dbl>,
## #   lower <dbl>, upper <dbl>
obayes$region_est %>% filter(locus=="O2a")
## # A tibble: 0 × 6
## # Groups:   locus [0]
## # ℹ 6 variables: locus <chr>, subgroup <chr>, median <dbl>, mean <dbl>,
## #   lower <dbl>, upper <dbl>
obayes$region_est %>% filter(locus=="O4")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup        median   mean  lower upper
##   <chr> <chr>            <dbl>  <dbl>  <dbl> <dbl>
## 1 O4    Eastern Africa  0.0724 0.0732 0.0497 0.101
## 2 O4    Southern Africa 0.0705 0.0724 0.0398 0.115
## 3 O4    Southern Asia   0.137  0.138  0.0984 0.184
## 4 O4    Western Africa  0.0540 0.0574 0.0192 0.116
obayes$region_est %>% filter(locus=="O5")
## # A tibble: 4 × 6
## # Groups:   locus [1]
##   locus subgroup        median   mean   lower  upper
##   <chr> <chr>            <dbl>  <dbl>   <dbl>  <dbl>
## 1 O5    Eastern Africa  0.0322 0.0330 0.0181  0.0523
## 2 O5    Southern Africa 0.119  0.121  0.0743  0.176 
## 3 O5    Southern Asia   0.0482 0.0495 0.0272  0.0782
## 4 O5    Western Africa  0.0276 0.0305 0.00729 0.0704
obayes$region_est %>% filter(locus=="O3b")
## # A tibble: 0 × 6
## # Groups:   locus [0]
## # ℹ 6 variables: locus <chr>, subgroup <chr>, median <dbl>, mean <dbl>,
## #   lower <dbl>, upper <dbl>
obayes$region_est %>% filter(locus=="OL13")
## # A tibble: 0 × 6
## # Groups:   locus [0]
## # ℹ 6 variables: locus <chr>, subgroup <chr>, median <dbl>, mean <dbl>,
## #   lower <dbl>, upper <dbl>
# coverage
obayes_raw_coverage
## # A tibble: 150 × 7
##     rank  mean median lower upper locus   subgroup
##    <int> <dbl>  <dbl> <dbl> <dbl> <chr>   <fct>   
##  1     1 0.259  0.259 0.240 0.278 O1⍺β,2⍺ Global  
##  2     2 0.437  0.437 0.411 0.463 O1⍺β,2β Global  
##  3     3 0.681  0.681 0.649 0.713 O2β     Global  
##  4     4 0.811  0.811 0.776 0.846 O4      Global  
##  5     5 0.862  0.862 0.826 0.899 O2⍺     Global  
##  6     6 0.890  0.890 0.853 0.927 O3𝛾     Global  
##  7     7 0.925  0.925 0.888 0.964 O5      Global  
##  8     8 0.973  0.972 0.934 1.01  O13     Global  
##  9     9 0.991  0.991 0.951 1.03  O3⍺/O3β Global  
## 10    10 0.992  0.992 0.952 1.03  O15     Global  
## # ℹ 140 more rows
# STs with O4 or O2a in Southern Asia
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O4") %>% group_by(ST, K_locus)%>% count()
## # A tibble: 16 × 3
## # Groups:   ST, K_locus [16]
##    ST        K_locus     n
##    <chr>     <chr>   <int>
##  1 ST11      KL15       91
##  2 ST152     KL149       3
##  3 ST2258    KL131       2
##  4 ST231     KL144       1
##  5 ST273     KL141       1
##  6 ST273     KL15        3
##  7 ST340     KL15        5
##  8 ST3623    KL15        1
##  9 ST37      KL15        2
## 10 ST392     KL27        1
## 11 ST429     KL27        1
## 12 ST437     KL36       10
## 13 ST502     KL15       15
## 14 ST70      KL15        1
## 15 ST726     KL15        1
## 16 ST883-1LV KL15        1
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O4") %>% group_by(ST, K_locus, Site)%>% count()
## # A tibble: 27 × 4
## # Groups:   ST, K_locus, Site [27]
##    ST     K_locus  Site     n
##    <chr>  <chr>   <dbl> <int>
##  1 ST11   KL15        2     6
##  2 ST11   KL15       35     1
##  3 ST11   KL15       37    59
##  4 ST11   KL15       38     7
##  5 ST11   KL15       46    18
##  6 ST152  KL149      37     3
##  7 ST2258 KL131      35     1
##  8 ST2258 KL131      38     1
##  9 ST231  KL144       8     1
## 10 ST273  KL141      37     1
## # ℹ 17 more rows
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O2a") %>% group_by(ST, K_locus)%>% count()
## # A tibble: 0 × 3
## # Groups:   ST, K_locus [0]
## # ℹ 3 variables: ST <chr>, K_locus <chr>, n <int>
data_NNS_sites10 %>% filter(Region=="Southern Asia" & O_genotype=="O2a") %>% group_by(ST, K_locus, Site)%>% count()
## # A tibble: 0 × 4
## # Groups:   ST, K_locus, Site [0]
## # ℹ 4 variables: ST <chr>, K_locus <chr>, Site <dbl>, n <int>
data_NNS_sites10 %>% filter(Region=="Southern Africa" & O_genotype=="O5") %>% group_by(ST, K_locus)%>% count()
## # A tibble: 3 × 3
## # Groups:   ST, K_locus [3]
##   ST     K_locus     n
##   <chr>  <chr>   <int>
## 1 ST17   KL25       32
## 2 ST2621 KL25        1
## 3 ST3184 KL25        1
data_NNS_sites10 %>% filter(Region=="Southern Africa" & O_genotype=="O5") %>% group_by(ST, K_locus, Site, Country)%>% count()
## # A tibble: 10 × 5
## # Groups:   ST, K_locus, Site, Country [10]
##    ST     K_locus  Site Country          n
##    <chr>  <chr>   <dbl> <chr>        <int>
##  1 ST17   KL25       27 South Africa     2
##  2 ST17   KL25       28 South Africa     3
##  3 ST17   KL25       29 South Africa     4
##  4 ST17   KL25       30 South Africa     7
##  5 ST17   KL25       31 South Africa     2
##  6 ST17   KL25       32 South Africa     2
##  7 ST17   KL25       42 South Africa     9
##  8 ST17   KL25       59 Botswana         3
##  9 ST2621 KL25       30 South Africa     1
## 10 ST3184 KL25       32 South Africa     1
# top-5 coverage
obayes_raw_coverage %>% filter(rank==5)
## # A tibble: 5 × 7
##    rank  mean median lower upper locus subgroup       
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <fct>          
## 1     5 0.862  0.862 0.826 0.899 O2⍺   Global         
## 2     5 0.920  0.920 0.872 0.969 O2⍺   Eastern Africa 
## 3     5 0.760  0.759 0.692 0.831 O2⍺   Southern Asia  
## 4     5 0.835  0.832 0.669 1.02  O2⍺   Western Africa 
## 5     5 0.813  0.812 0.714 0.917 O2⍺   Southern Africa
obayes_raw_coverage_subgroups %>% filter(rank==5)
## # A tibble: 4 × 8
##    rank  mean median lower upper locus type  subgroup
##   <int> <dbl>  <dbl> <dbl> <dbl> <chr> <chr> <fct>   
## 1     5 0.875  0.875 0.836 0.914 O2⍺   min10 ESBL    
## 2     5 0.775  0.774 0.706 0.845 O2⍺   min10 CP      
## 3     5 0.895  0.895 0.813 0.980 O2⍺   min10 Fatal   
## 4     5 0.862  0.862 0.826 0.899 O2⍺   <NA>  All

Table S5 - O coverage stats for all/ESBL/CP/fatal, for global and regional

obayes_fatal_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=obayes$locus_rank, bayes=obayes_fatal_raw_min10perSite, maxRank=10)

obayes_esbl_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=obayes$locus_rank, bayes=obayes_esbl_raw_min10perSite, maxRank=10)

obayes_cp_raw_min10perSite_coverageRegional <- getCumulativeCoverageGlobalRegional(ranks=obayes$locus_rank, bayes=obayes_cp_raw_min10perSite, maxRank=10)

tableS5 <- obayes_raw_coverage %>% filter(rank <= 10) %>% mutate(cases="All") %>%
  bind_rows(obayes_fatal_raw_min10perSite_coverageRegional %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(obayes_esbl_raw_min10perSite_coverageRegional %>% mutate(cases="  ESBL infections")) %>%
  bind_rows(obayes_cp_raw_min10perSite_coverageRegional %>% mutate(cases="  CP infections")) %>%
  mutate(upper=if_else(upper>1, 1, upper)) %>%
  mutate(summary=paste0(round(mean,3)*100," (",round(lower,3)*100,"-",round(upper,3)*100,")")) %>% 
  select(locus, subgroup, cases, summary) %>% 
  pivot_wider(names_from=locus, values_from=summary, id_cols=c(subgroup, cases)) %>%
  rename(Region=subgroup)

add totals per subgroup

tableS5 <- esbl_counts %>% mutate(cases="  ESBL infections") %>%
  bind_rows(cp_counts %>% mutate(cases="  CP infections")) %>%
  bind_rows(fatal_counts %>% mutate(cases="  Fatal infections")) %>%
  bind_rows(total_counts %>% mutate(cases="All")) %>%
  full_join(tableS5, by=c("Region", "cases")) %>%
  mutate(cases=fct_relevel(cases,c("All", "  Fatal infections", "  ESBL infections", "  CP infections")))

# sort function isn't working when ordering region factor using fct_relevel
tableS5$Region <- factor(tableS5$Region, levels = c("Global", "Eastern Africa", "Southern Africa", "Western Africa", "Southern Asia"))

tableS5 <- tableS5 %>%
  arrange(Region, cases) %>%
  relocate(n, .after=cases) %>% 
  mutate(cases=if_else(cases=="All", Region, cases)) %>%
  rename(N=n) %>%
  rename(Data=cases) %>%
  ungroup %>% select(-Region)
  
write_tsv(tableS5, file="tables/TableS5_coverage.tsv")

numbers for text - effect of cluster adjustment

kbayes_global %>% mutate(diff=mean.raw-mean.adj) %>% select(locus, mean.raw, mean.adj, diff, rank.raw, rank.adj) %>% arrange(-diff)
## # A tibble: 90 × 6
##    locus mean.raw mean.adj    diff rank.raw rank.adj
##    <chr>    <dbl>    <dbl>   <dbl>    <int>    <int>
##  1 KL102  0.177    0.0875  0.0897         1        2
##  2 KL15   0.101    0.0579  0.0431         3        4
##  3 KL2    0.120    0.0898  0.0299         2        1
##  4 KL104  0.0331   0.00859 0.0245         6       30
##  5 KL157  0.0150   0.00124 0.0138        16       73
##  6 KL81   0.0249   0.0123  0.0126         9       26
##  7 KL108  0.0212   0.00978 0.0114        11       29
##  8 KL149  0.0254   0.0197  0.00567        8       13
##  9 KL8    0.0161   0.0124  0.00371       14       23
## 10 KL127  0.00569  0.00369 0.00200       32       53
## # ℹ 80 more rows
data_NNS_sites10 %>% filter(K_locus=="KL102") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL102") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 71 × 6
##    Cluster   Country      ST         n      p   cum
##    <chr>     <chr>        <chr>  <int>  <dbl> <dbl>
##  1 SZ1       Zambia       ST307    203 0.594  0.594
##  2 CHRF187   Bangladesh   ST147     21 0.0614 0.655
##  3 SZ3       Zambia       ST307     11 0.0322 0.687
##  4 Kilifi141 Kenya        ST3247     8 0.0234 0.711
##  5 BG3       South Africa ST307      5 0.0146 0.725
##  6 CHRF120   Bangladesh   ST307      5 0.0146 0.740
##  7 MB33      Ghana        ST307      5 0.0146 0.754
##  8 BG33      South Africa ST307      4 0.0117 0.766
##  9 ML94      Malawi       ST307      4 0.0117 0.778
## 10 SZ30      Zambia       ST307      4 0.0117 0.789
## # ℹ 61 more rows
data_NNS_sites10 %>% filter(K_locus=="KL2") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL2") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 73 × 6
##    Cluster                      Country ST        n       p   cum
##    <chr>                        <chr>   <chr> <int>   <dbl> <dbl>
##  1 ML31                         Malawi  ST39     87 0.377   0.377
##  2 ML38                         Malawi  ST14     23 0.0996  0.476
##  3 Kiambu sub-County Hospital30 Kenya   ST14     12 0.0519  0.528
##  4 Mbagathi54                   Kenya   ST14     11 0.0476  0.576
##  5 ML24                         Malawi  ST14      8 0.0346  0.610
##  6 Kilifi86                     Kenya   ST14      7 0.0303  0.641
##  7 ML32                         Malawi  ST25      4 0.0173  0.658
##  8 ML57                         Malawi  ST25      4 0.0173  0.675
##  9 Kilifi125                    Kenya   ST101     3 0.0130  0.688
## 10 MB89                         Malawi  ST25      2 0.00866 0.697
## # ℹ 63 more rows
data_NNS_sites10 %>% filter(K_locus=="KL15") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL15") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 45 × 6
##    Cluster Country    ST         n      p   cum
##    <chr>   <chr>      <chr>  <int>  <dbl> <dbl>
##  1 CHRF149 Bangladesh ST11      66 0.338  0.338
##  2 BA24    Ethiopia   ST37      26 0.133  0.472
##  3 CHRF169 Bangladesh ST502     14 0.0718 0.544
##  4 NEO2    India      ST11      14 0.0718 0.615
##  5 ML12    Malawi     ST340      7 0.0359 0.651
##  6 ML123   Malawi     ST45       7 0.0359 0.687
##  7 ML16    Malawi     ST340      6 0.0308 0.718
##  8 aiims10 India      ST11       6 0.0308 0.749
##  9 MB51    Zambia     ST5856     4 0.0205 0.769
## 10 BA20    Ethiopia   ST37       3 0.0154 0.785
## # ℹ 35 more rows
data_NNS_sites10 %>% filter(K_locus=="KL104") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL104") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 8 × 6
##   Cluster   Country  ST             n      p   cum
##   <chr>     <chr>    <chr>      <int>  <dbl> <dbl>
## 1 MB102     Tanzania ST1741        50 0.781  0.781
## 2 Kilifi117 Kenya    ST1741         5 0.0781 0.859
## 3 aiims21   India    ST1741         3 0.0469 0.906
## 4 aiims13   India    ST1741         2 0.0312 0.938
## 5 MB101     Tanzania ST1741         1 0.0156 0.953
## 6 MB102     Tanzania ST1741-1LV     1 0.0156 0.969
## 7 MB99      Tanzania ST1741         1 0.0156 0.984
## 8 ML75      Malawi   ST245-2LV      1 0.0156 1
data_NNS_sites10 %>% filter(K_locus=="KL157") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL157") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 1 × 6
##   Cluster                      Country ST         n     p   cum
##   <chr>                        <chr>   <chr>  <int> <dbl> <dbl>
## 1 Kiambu sub-County Hospital42 Kenya   ST6775    29     1     1
data_NNS_sites10 %>% filter(K_locus=="KL81") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL81") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 9 × 6
##   Cluster Country    ST        n      p   cum
##   <chr>   <chr>      <chr> <int>  <dbl> <dbl>
## 1 CHRF182 Bangladesh ST16     28 0.583  0.583
## 2 AKU15   Pakistan   ST16      7 0.146  0.729
## 3 CHRF126 Bangladesh ST16      6 0.125  0.854
## 4 AKU69   Pakistan   ST16      2 0.0417 0.896
## 5 AKU39   Pakistan   ST16      1 0.0208 0.917
## 6 CHRF165 Bangladesh ST16      1 0.0208 0.938
## 7 CHRF167 Bangladesh ST16      1 0.0208 0.958
## 8 CHRF176 Bangladesh ST16      1 0.0208 0.979
## 9 CHRF178 Bangladesh ST16      1 0.0208 1
data_NNS_sites10 %>% filter(K_locus=="KL108") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL108") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 8 × 6
##   Cluster Country      ST        n      p   cum
##   <chr>   <chr>        <chr> <int>  <dbl> <dbl>
## 1 BA23    Ethiopia     ST35     32 0.780  0.780
## 2 BA30    Ethiopia     ST35      3 0.0732 0.854
## 3 BA25    Ethiopia     ST35      1 0.0244 0.878
## 4 BA26    Ethiopia     ST35      1 0.0244 0.902
## 5 BA32    Ethiopia     ST35      1 0.0244 0.927
## 6 BA73    Nigeria      ST395     1 0.0244 0.951
## 7 ML173   Malawi       ST110     1 0.0244 0.976
## 8 WI35    South Africa ST35      1 0.0244 1
data_NNS_sites10 %>% filter(K_locus=="KL149") %>% group_by(Cluster, Country, ST) %>% count() %>% arrange(-n) %>% mutate(p=n/data_NNS_sites10 %>% filter(K_locus=="KL149") %>% nrow()) %>% ungroup() %>% mutate(cum=cumsum(p))
## # A tibble: 16 × 6
##    Cluster Country      ST        n      p   cum
##    <chr>   <chr>        <chr> <int>  <dbl> <dbl>
##  1 BG27    South Africa ST152    16 0.327  0.327
##  2 WI20    South Africa ST39     10 0.204  0.531
##  3 BG77    South Africa ST39      5 0.102  0.633
##  4 CHRF139 Bangladesh   ST152     3 0.0612 0.694
##  5 BG16    South Africa ST152     2 0.0408 0.735
##  6 BG37    South Africa ST152     2 0.0408 0.776
##  7 WI43    South Africa ST39      2 0.0408 0.816
##  8 BG40    South Africa ST39      1 0.0204 0.837
##  9 BG72    South Africa ST39      1 0.0204 0.857
## 10 BG84    South Africa ST152     1 0.0204 0.878
## 11 BG93    South Africa ST39      1 0.0204 0.898
## 12 BG96    South Africa ST39      1 0.0204 0.918
## 13 MB73    Ethiopia     ST152     1 0.0204 0.939
## 14 WI15    South Africa ST152     1 0.0204 0.959
## 15 WI17    South Africa ST39      1 0.0204 0.980
## 16 WI36    South Africa ST152     1 0.0204 1
# how do these compare to those added to KL set 2 to get good region coverage? - only KL81
k_rank_region8 %>% left_join(kbayes$locus, by="locus") %>% filter(rank.y>20)
## # A tibble: 5 × 3
##   locus rank.x rank.y
##   <chr>  <int>  <int>
## 1 KL28      16     22
## 2 KL8       17     23
## 3 KL81      18     26
## 4 KL116     19     36
## 5 KL53      20     48

Leave One out (LOO)

Sensitivity analysis - O type, Bayesian estimate, adjusted

loo <- obayes$global_est %>% mutate(Study="None")

loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_AKU_global_estimates.csv") %>% 
  mutate(Study="AKU") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_BabyGERMS_global_estimates.csv") %>% 
  mutate(Study="BabyGERMS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_BARNARDS_global_estimates.csv") %>% 
  mutate(Study="BARNARDS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_Botswana_global_estimates.csv") %>% 
  mutate(Study="NIMBIplus") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_CHRF_global_estimates.csv") %>% 
  mutate(Study="CHRF") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_AIIMS_global_estimates.csv") %>% 
  mutate(Study="DH") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_GBS_global_estimates.csv") %>% 
  mutate(Study="GBSCOP") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_Kilifi_global_estimates.csv") %>% 
  mutate(Study="Kilifi") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_MBIRA_global_estimates.csv") %>% 
  mutate(Study="MBIRA") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_MLW_global_estimates.csv") %>% 
  mutate(Study="MLW") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 14 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_NeoBAC_global_estimates.csv") %>% 
  mutate(Study="NeoBAC") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_NeoOBS_global_estimates.csv") %>% 
  mutate(Study="NeoOBS-India") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo_O <- read_csv("outputs_LOO/O_Full_min10_adj_28_LOO_SPINZ_global_estimates.csv") %>% 
  mutate(Study="SPINZ") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo) %>%
  mutate(locus=replace(locus, locus=="O1.v1", "O1⍺β,2⍺"))  %>%
      mutate(locus=replace(locus, locus=="O1.v2", "O1⍺β,2β"))  %>%
      mutate(locus=replace(locus, locus=="O1.v3", "O1⍺β,2⍺𝛾"))  %>%
      mutate(locus=replace(locus, locus=="O2a.v1", "O2⍺")) %>%
      mutate(locus=replace(locus, locus=="O2a.v3", "O2⍺𝛾")) %>%
      mutate(locus=replace(locus, locus=="O2afg.v2", "O2β")) %>%
      mutate(locus=replace(locus, locus=="O3/O3a", "O3⍺/O3β")) %>%
      mutate(locus=replace(locus, locus=="O3b", "O3𝛾")) %>%
      mutate(locus=replace(locus, locus=="OL13", "O13")) %>%
      mutate(locus=replace(locus, locus=="OL102", "O14")) %>%
      mutate(locus=replace(locus, locus=="OL103", "O10")) %>%
      mutate(locus=replace(locus, locus=="OL104", "O15"))
## Rows: 15 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Bayes global sensitivity analysis, top20 in any, print rank - O type

loo_O_plot <- loo_O %>% 
  filter(rank<=9) %>%
  ggplot(aes(y=locus, x=Study, fill=factor(rank))) +
  geom_tile() + 
  geom_text(aes(y=locus, x=Study, label=round(rank)), size=4) +
  scale_y_discrete(limits=rev(obayes$locus_rank$locus[1:9])) + 
  scale_x_discrete(limits=rev(unique(loo_O$Study))) + # keep the order as per tibble, starting with 'None'
  #scale_fill_gradient("Global\nprevalence (%)", low = "white", high = "#081d58")
  scale_fill_manual(values = c(rev(brewer.pal(5, "Reds")), rev(brewer.pal(6, "Blues")[-1]))) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                   colour = "black"), 
        axis.title = element_text(size = 10, colour = "black"),
        axis.text.y = element_text(size = 10, colour = "black"),
        plot.title = element_text(hjust=0.5),
        legend.position = "right") +
  labs(y="", x="Excluded study", title="Sensitivity analysis - O types", fill="Rank")

Sensitivity analysis - K locus, Bayesian estimate, adjusted

loo <- kbayes$global_est %>% mutate(Study="None")

loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_AKU_global_estimates.csv") %>% 
  mutate(Study="AKU") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 87 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_BabyGERMS_global_estimates.csv") %>% 
  mutate(Study="BabyGERMS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 89 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_BARNARDS_global_estimates.csv") %>% 
  mutate(Study="BARNARDS") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 85 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_Botswana_global_estimates.csv") %>% 
  mutate(Study="NIMBIplus") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_CHRF_global_estimates.csv") %>% 
  mutate(Study="CHRF") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 84 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_AIIMS_global_estimates.csv") %>% 
  mutate(Study="DH") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 89 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_GBS_global_estimates.csv") %>% 
  mutate(Study="GBSCOP") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_Kilifi_global_estimates.csv") %>% 
  mutate(Study="Kilifi") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 85 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_MBIRA_global_estimates.csv") %>% 
  mutate(Study="MBIRA") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 88 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_MLW_global_estimates.csv") %>% 
  mutate(Study="MLW") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 87 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_NeoBAC_global_estimates.csv") %>% 
  mutate(Study="NeoBAC") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 89 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_NeoOBS_global_estimates.csv") %>% 
  mutate(Study="NeoOBS-India") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo)
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
loo <- read_csv("outputs_LOO/K_Full_min10_adj_28_LOO_SPINZ_global_estimates.csv") %>% 
  mutate(Study="SPINZ") %>% 
  arrange(-mean) %>%
  mutate(rank = min_rank(desc(mean))) %>% 
  bind_rows(loo) 
## Rows: 90 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (4): mean, median, lower, upper
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

sensitivity analysis for K, top20 in any, print rank

top20_sens <- loo %>% filter(rank<=20) %>% pull(locus) %>% unique()

loo_k_ranks <- loo %>% 
  filter(rank<=20) %>%
  ggplot(aes(y=locus, x=Study, fill=factor(rank))) +
  geom_tile() + 
  geom_text(aes(y=locus, x=Study, label=round(rank)), size=4) +
  scale_y_discrete(limits=rev(kbayes$locus_rank$locus[kbayes$locus_rank$locus %in% top20_sens])) + 
  scale_x_discrete(limits=rev(unique(loo$Study))) + # keep the order as per tibble, starting with 'None'
  #scale_fill_gradient("Global\nprevalence (%)", low = "white", high = "#081d58")
  scale_fill_manual(values = c(rev(brewer.pal(5, "Reds")), rev(brewer.pal(6, "Purples")[-1]), rev(brewer.pal(6, "Blues")[-1]), rev(brewer.pal(5, "Greens")))) +
  theme_bw() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                   colour = "black"), 
        axis.title = element_text(size = 10, colour = "black"),
        axis.text.y = element_text(size = 10, colour = "black"),
        plot.title = element_text(hjust=0.5),
        legend.position = "right") +
  labs(y="", x="Excluded study", title="Sensitivity analysis", fill="Rank") +
  geom_hline(yintercept=length(top20_sens)+0.5-5) +
  geom_hline(yintercept=length(top20_sens)+0.5-10) +
  geom_hline(yintercept=length(top20_sens)+0.5-15) +
  geom_hline(yintercept=length(top20_sens)+0.5-20)

LOO - K coverage

studies <- unique(data_NNS_sites10$Study)
studies[c(2,8,12)]
## [1] "NIMBIplus"    "NeoOBS-India" "DH"
studies <- studies[-c(2,8,12)]
studies <- c(studies, c("AIIMS", "NeoOBS", "Botswana")) # filenames

loo_k_coverage <- tibble(
    rank = integer(0),
    mean = double(0),
    median = double(0),
    lower = double(0),
    upper = double(0),
    locus = character(0),
    subgroup = character(0),
    Study = character(0)
  )

for (study in studies) {

# read raw Bayesian estimates for K
loo_kbayes_raw <- parseModelledEstimates(global_post=paste0("outputs_LOO/K_Full_min10_raw_28_LOO_",study,"_posterior_global.csv.gz"), region_post=paste0("outputs_LOO/K_Full_min10_raw_28_LOO_",study,"_posterior_subgroup.csv.gz"))

this_coverage <- getCumulativeCoverageGlobalRegional(loo_kbayes_raw, kbayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa"))) %>%
  mutate(Study=study)

loo_k_coverage <- bind_rows(loo_k_coverage, this_coverage)
}
## Rows: 1068000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4272000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1044000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4176000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1008000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4032000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1020000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1020000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1068000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4272000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1044000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4176000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1056000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4224000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1068000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4272000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
loo_k_coverage_plot <- loo_k_coverage %>% 
  mutate(subgroup=fct_relevel(subgroup,rev(c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))) %>%
  bind_rows(kbayes_raw_coverage %>% mutate(Study="None")) %>% 
  filter(rank==20) %>% 
  mutate(Study=if_else(Study=="NeoOBS", "NeoOBS-India", Study)) %>%
  mutate(Study=if_else(Study=="AIIMS", "DH", Study)) %>%
  mutate(Study=if_else(Study=="GBS", "GBSCOP", Study)) %>%
  mutate(Study=if_else(Study=="Botswana", "NIMBIplus", Study)) %>%
  ggplot(aes(x=Study, y=subgroup, fill=mean)) + 
    geom_tile() + 
    geom_text(aes(y=subgroup, x=Study, label=round(100*mean, 2)), size=3) +
    scale_x_discrete(limits=rev(unique(loo$Study))) + 
    scale_fill_gradient(low="white", high="grey") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                     colour = "black"), 
          axis.title = element_text(size = 10, colour = "black"),
          axis.text.y = element_text(size = 10, colour = "black"),
          plot.title = element_text(hjust=0.5),
          legend.position = "right") +
    labs(y="", x="Excluded study", fill="Coverage")

Appendix Fig S2.1 - LOO K

loo_k_ranks / loo_k_coverage_plot + patchwork::plot_layout(heights=c(4,1), axes="collect")

ggsave(file="figures/AppendixFigS2.1_LOO_BayesGlobal_Klocus.pdf", width=9, height=11)
ggsave(file="figures/AppendixFigS2.1_LOO_BayesGlobal_Klocus.png", width=9, height=11)
loo_o_coverage <- tibble(
    rank = integer(0),
    mean = double(0),
    median = double(0),
    lower = double(0),
    upper = double(0),
    locus = character(0),
    subgroup = character(0),
    Study = character(0)
  )

for (study in studies) {

# read raw Bayesian estimates for K
loo_obayes_raw <- parseModelledEstimates(global_post=paste0("outputs_LOO/O_Full_min10_raw_28_LOO_",study,"_posterior_global.csv.gz"), region_post=paste0("outputs_LOO/O_Full_min10_raw_28_LOO_",study,"_posterior_subgroup.csv.gz"), fixOnames = T)

this_coverage <- getCumulativeCoverageGlobalRegional(loo_obayes_raw, obayes$locus_rank) %>%
  mutate(subgroup=fct_relevel(subgroup,c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa"))) %>%
  mutate(Study=study)

loo_o_coverage <- bind_rows(loo_o_coverage, this_coverage)
}
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 168000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 672000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
loo_o_coverage_plot <- loo_o_coverage %>% 
  mutate(subgroup=fct_relevel(subgroup,rev(c("Global", "Southern Asia", "Eastern Africa", "Southern Africa", "Western Africa")))) %>%
  bind_rows(obayes_raw_coverage %>% mutate(Study="None")) %>% 
  filter(rank==5) %>% 
  mutate(Study=if_else(Study=="NeoOBS", "NeoOBS-India", Study)) %>%
  mutate(Study=if_else(Study=="AIIMS", "DH", Study)) %>%
  mutate(Study=if_else(Study=="GBS", "GBSCOP", Study)) %>%
  mutate(Study=if_else(Study=="Botswana", "NIMBIplus", Study)) %>%
  ggplot(aes(x=Study, y=subgroup, fill=mean)) + 
    geom_tile() + 
    geom_text(aes(y=subgroup, x=Study, label=round(100*mean, 2)), size=3) +
    scale_x_discrete(limits=rev(unique(loo$Study))) + 
    scale_fill_gradient(low="lightgrey", high="darkgrey") +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 10, 
                                     colour = "black"), 
          axis.title = element_text(size = 10, colour = "black"),
          axis.text.y = element_text(size = 10, colour = "black"),
          plot.title = element_text(hjust=0.5),
          legend.position = "right") +
    labs(y="", x="Excluded study", fill="Coverage")

Appendix Fig S2.2

loo_O_plot / loo_o_coverage_plot + plot_layout(heights=c(2,1), axes="collect")

ggsave(file="figures/AppendixFigS2.2_LOO_BayesGlobal_Olocus.pdf", width=9, height=9, device = cairo_pdf, family = "Arial Unicode MS")
ggsave(file="figures/AppendixFigS2.2_LOO_BayesGlobal_Olocus.png", width=9, height=9)

EFFECT OF TEMPORAL THRESHOLD FOR CLUSTERING ON MODELLED ESTIMATES

compare global prevalence distributions using counts cluster-adjusted using 28 vs 365-day threshold

# read Bayesian estimates for K adjusted using 365 days
kbayes_365 <- parseModelledEstimates(global_post="outputs_core/K_Full_ALL_adj_365_posterior_global.csv.gz", region_post="outputs_core/K_Full_ALL_adj_28_posterior_subgroup.csv.gz")
## Rows: 1080000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 4320000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# plot raw vs adjusted distributions, overlaid - for fig S15
k_dist_28_365 <- comparative_locus_ridgesplot(posterior1=kbayes$global_post,
                             posterior2=kbayes_365$global_post,
                             ranks=kbayes$locus_rank, 
                             lines_every=10, xlim=c(0,12), xbreaks=seq(0,12,1),
                             maxRank=30, type1="28 days", type2="365 days", legend_title="Cluster threshold", scale=2, title="e) Modelled K estimates - posterior")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0752
## Warning: Removed 72 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

# read Bayesian estimates for O adjusted using 365 days
obayes_365 <- parseModelledEstimates(global_post="outputs_core/O_Full_ALL_adj_365_posterior_global.csv.gz", region_post="outputs_core/O_Full_ALL_adj_365_posterior_subgroup.csv.gz", fixOnames = T)
## Rows: 180000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 720000 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): locus, subgroup
## dbl (2): draw_id, prob
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'locus'. You can override using the `.groups` argument.
# plot raw vs adjusted distributions, overlaid - for fig S15
o_dist_28_365 <- comparative_locus_ridgesplot(posterior1=obayes$global_post,
                             posterior2=obayes_365$global_post,
                             ranks=obayes$locus_rank, 
                             lines_every=5, xlim=c(0,30), xbreaks=seq(0,30,2),
                             maxRank=15, type1="28 days", type2="365 days", legend_title="Cluster threshold", scale=2, title="f) Modelled O estimates - posterior")
## Joining with `by = join_by(locus)`
## Picking joint bandwidth of 0.0953
## Warning: Removed 265 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

Scatter plots for K mean and rank

adj_28_vs_365 <- full_join(kbayes$global_est %>% arrange(-mean) %>% ungroup %>% mutate(rank=row_number()), 
          kbayes_365$global_est %>% arrange(-mean) %>% ungroup() %>% mutate(rank=row_number()),
          by="locus", suffix=c(".28", ".365")) 

timeThreshold_estimates_adj <- adj_28_vs_365 %>%
  ggplot(aes(x=mean.28, y=mean.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw() + 
  labs(x="28-day threshold", y="365-day threshold", title="c) Mean K estimates")

timeThreshold_ranks_adj <- adj_28_vs_365 %>%
  ggplot(aes(x=rank.28, y=rank.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw()  + 
  labs(x="28-day threshold", y="365-day threshold", title="a) Modelled K ranks")

timeThreshold_estimates_adj + timeThreshold_ranks_adj

Scatter plots for O mean and rank

adj_28_vs_365_o <- full_join(obayes$global_est %>% arrange(-mean) %>% ungroup %>% mutate(rank=row_number()), 
          obayes_365$global_est %>% arrange(-mean) %>% ungroup() %>% mutate(rank=row_number()),
          by="locus", suffix=c(".28", ".365")) 

timeThreshold_estimates_adj_o <- adj_28_vs_365_o %>%
  ggplot(aes(x=mean.28, y=mean.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw() + 
  labs(x="28-day threshold", y="365-day threshold", title="d) Mean O estimates")

timeThreshold_ranks_adj_o <- adj_28_vs_365_o %>%
  ggplot(aes(x=rank.28, y=rank.365)) + 
  geom_abline(intercept=0, slope=1, col="grey") +
  geom_point() + theme_bw()  + 
  labs(x="28-day threshold", y="365-day threshold", title="b) Modelled O ranks")

timeThreshold_estimates_adj_o + timeThreshold_ranks_adj_o

Appendix Fig S2.5

(timeThreshold_ranks_adj + timeThreshold_ranks_adj_o) / (timeThreshold_estimates_adj + timeThreshold_estimates_adj_o) / (k_dist_28_365 + o_dist_28_365 + patchwork::plot_layout(guides="collect")) + patchwork::plot_layout(heights=c(1,1,2)) & theme(legend.position='bottom')
## Picking joint bandwidth of 0.0752
## Warning: Removed 72 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.0953
## Warning: Removed 265 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).

ggsave("figures/AppendixFigS2.5_temporalThresholdModelled.pdf", width=8, height=10, device = cairo_pdf, family = "Arial Unicode MS")
## Picking joint bandwidth of 0.0752
## Warning: Removed 72 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.0953
## Warning: Removed 265 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
ggsave("figures/AppendixFigS2.5_temporalThresholdModelled.png", width=8, height=10)
## Picking joint bandwidth of 0.0752
## Warning: Removed 72 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).
## Picking joint bandwidth of 0.0953
## Warning: Removed 265 rows containing non-finite outside the scale range
## (`stat_density_ridges()`).